author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-21 11:54:28 +0000
commit     e6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree       64f88b554b444a49f656b6c656111a145cbbaa28 /src/crypto
parent     Initial commit. (diff)
Adding upstream version 18.2.2. (tag: upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/crypto')
-rw-r--r--src/crypto/CMakeLists.txt12
-rw-r--r--src/crypto/crypto_accel.h37
-rw-r--r--src/crypto/crypto_plugin.h36
-rw-r--r--src/crypto/isa-l/CMakeLists.txt36
-rw-r--r--src/crypto/isa-l/isa-l_crypto/.gitignore27
-rw-r--r--src/crypto/isa-l/isa-l_crypto/CONTRIBUTING.md39
-rw-r--r--src/crypto/isa-l/isa-l_crypto/Doxyfile31
-rw-r--r--src/crypto/isa-l/isa-l_crypto/LICENSE26
-rw-r--r--src/crypto/isa-l/isa-l_crypto/Makefile.am161
-rw-r--r--src/crypto/isa-l/isa-l_crypto/Makefile.nmake493
-rw-r--r--src/crypto/isa-l/isa-l_crypto/Makefile.unx50
-rw-r--r--src/crypto/isa-l/isa-l_crypto/README.md63
-rw-r--r--src/crypto/isa-l/isa-l_crypto/Release_notes.txt215
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/Makefile.am170
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm1778
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm1748
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm1747
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_vaes.asm1648
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm1779
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm1681
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm1531
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm1506
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm1505
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_vaes.asm1473
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm1530
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm1498
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm1962
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm1896
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm1898
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_vaes.asm1808
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm1963
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm1875
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm1708
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm1653
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm1652
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_vaes.asm1634
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm1708
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm1687
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_128.S215
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_256.S220
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S161
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_consts.S140
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S30
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S30
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S30
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S30
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_128.S32
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_256.S32
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_aarch64_dispatcher.c108
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_common.S54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S482
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S157
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_multibinary_aarch64.S38
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_aarch64_dispatcher.c255
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common.S430
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_128.S165
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_256.S181
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S588
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_multibinary_aarch64.S58
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S83
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_update.S277
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_128_aarch64_aes.S134
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_192_aarch64_aes.S136
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_256_aarch64_aes.S153
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_aarch64_dispatcher.c72
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_multibinary_aarch64.S35
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aarch64_dispatcher.c102
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_common.S214
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_dec.S116
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_enc.S91
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_common.S247
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_dec.S116
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_enc.S88
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_common.S232
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_dec.S49
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_enc.S49
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_dec.S49
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_enc.S49
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_multibinary_aarch64.S39
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm377
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm431
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm162
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm162
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm164
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm158
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm161
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm158
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm519
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm137
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm151
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm149
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm147
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm141
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm148
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm102
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c339
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c56
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h466
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c443
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c183
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm202
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm31
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm31
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm31
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm32
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm31
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm31
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm31
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm32
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm33
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm2130
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm3277
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm291
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_keys_vaes_avx512.asm233
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm184
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm118
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c2038
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_std_vectors_test.c322
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c272
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c61
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_simple_example.c78
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm2171
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c1940
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c659
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm4296
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h476
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm328
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm274
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm286
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm68
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h302
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c143
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c125
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c144
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c123
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c116
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c247
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c271
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c106
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h1691
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c145
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c126
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c145
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c124
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c113
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c249
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c273
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c105
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h1035
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm78
-rw-r--r--src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm78
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/autogen.sh17
-rw-r--r--src/crypto/isa-l/isa-l_crypto/configure.ac349
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/Makefile27
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/README.txt25
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/aes_thread.c380
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.c206
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.h52
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/md5_thread.c213
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha1_thread.c20
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha256_thread.c20
-rw-r--r--src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha512_thread.c20
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/aarch64_multibinary.h301
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h165
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h613
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h76
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/aes_xts.h214
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/datastruct.asm79
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/endian_helper.h83
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/intrinreg.h65
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/md5_mb.h372
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/memcpy.asm615
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h375
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h315
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h327
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/mh_sha256.h315
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h112
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/multibinary.asm517
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm442
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/rolling_hashx.h114
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h450
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h451
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h422
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/sm3_mb.h155
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/test.h111
-rw-r--r--src/crypto/isa-l/isa-l_crypto/include/types.h100
-rw-r--r--src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def80
-rw-r--r--src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in11
-rw-r--r--src/crypto/isa-l/isa-l_crypto/make.inc340
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am98
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_ctx_aarch64_asimd.c230
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_aarch64_dispatcher.c59
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x1.S248
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x4.S526
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c187
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_multibinary.S36
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c263
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c263
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c267
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base.c291
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base_aliases.c50
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c249
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm55
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm73
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm248
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm255
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm315
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm249
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c44
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm228
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm239
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm283
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm229
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c159
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c202
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c297
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c229
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c129
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm853
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm783
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm779
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm920
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm80
-rw-r--r--src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c186
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am83
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c55
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_asimd.c53
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_asimd.S124
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_ce.S384
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_ce.c53
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_multibinary.S35
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/sha1_asimd_common.S269
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c141
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c70
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_base_aliases.c40
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm506
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm508
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm406
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c387
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm498
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c122
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h308
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm77
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c180
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c430
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c217
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c110
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c240
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c204
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am89
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_dispatcher.c53
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_internal.h91
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_asimd.c54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_asimd.S224
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_ce.S482
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_ce.c54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_multibinary.S34
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/sha1_asimd_common.S271
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c154
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c67
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_base_aliases.c43
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm706
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm653
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm504
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm702
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c102
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h202
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm76
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c206
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c248
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c107
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c272
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c85
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c138
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am88
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c49
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S731
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c53
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S35
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c143
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c70
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c40
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm557
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm616
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm682
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c188
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm557
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c121
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h318
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm77
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c180
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c410
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c217
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c110
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c240
-rw-r--r--src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c176
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/Makefile.am57
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_dispatcher.c37
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_multibinary.S35
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_run_until_unroll.S115
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/chunking_with_mb_hash.c222
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2.c169
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_base_aliases.c39
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_multibinary.asm122
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_perf.c120
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_table.h296
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_test.c314
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_00.asm204
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_04.asm203
-rw-r--r--src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hashx_base.c65
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am130
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S294
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S269
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c250
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c250
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c93
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S192
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c217
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c208
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S36
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S194
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S253
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c265
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c264
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c271
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c281
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c325
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c251
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c259
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm67
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c146
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm74
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm247
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm273
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm271
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm278
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm249
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm256
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm246
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm250
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm248
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm246
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm290
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c159
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c202
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c297
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c233
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c128
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c132
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm563
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm416
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm413
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm518
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c112
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm131
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm318
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm484
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm485
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c220
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am127
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c256
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c59
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c254
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S36
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S238
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S289
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S342
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S380
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c268
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c268
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c273
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c283
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c301
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c256
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c262
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm65
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c146
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm74
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm253
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm274
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm288
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm295
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm254
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm261
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c41
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm260
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm246
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm261
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm261
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm301
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c160
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c203
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c300
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c241
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c129
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c132
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm930
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm431
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm426
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm620
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm125
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm361
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm574
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm567
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c204
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am108
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_ctx_ce.c256
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_aarch64_dispatcher.c59
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_mgr_ce.c210
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_multibinary.S36
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x1_ce.S269
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x2_ce.S390
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c269
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c269
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c274
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base.c323
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base_aliases.c54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c255
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c255
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm72
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm224
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm245
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm270
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm227
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c45
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c42
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c43
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm262
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm270
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm280
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm260
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c160
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c203
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c300
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c270
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c129
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm442
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm424
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm487
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm644
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm252
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c234
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c46
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c38
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c65
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm396
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am121
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c65
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S387
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S576
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c246
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c241
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c188
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c250
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S36
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S237
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S344
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S368
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S440
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c284
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c292
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c314
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c54
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm65
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c145
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm77
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm258
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm276
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm247
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm273
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c160
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c206
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c298
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c250
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c128
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c133
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm1035
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm711
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm81
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c207
-rw-r--r--src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c45
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile19
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile.nmake58
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tests/extended/md5_mb_over_4GB_test.c155
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tests/extended/sha1_mb_over_4GB_test.c156
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tests/extended/sha256_mb_over_4GB_test.c156
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tests/extended/sha512_mb_over_4GB_test.c156
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tests/extended/sm3_mb_over_4GB_test.c162
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/check_format.sh87
-rw-r--r--src/crypto/isa-l/isa-l_crypto/tools/gen_nmake.mk123
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/iindent2
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/nasm-cet-filter.sh56
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh47
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/remove_trailing_whitespace.sh2
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/test_autorun.sh63
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/test_checks.sh73
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/test_extended.sh127
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/test_tools.sh11
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/yasm-cet-filter.sh47
-rwxr-xr-xsrc/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh38
-rw-r--r--src/crypto/isa-l/isal_crypto_accel.cc43
-rw-r--r--src/crypto/isa-l/isal_crypto_accel.h31
-rw-r--r--src/crypto/isa-l/isal_crypto_plugin.cc34
-rw-r--r--src/crypto/isa-l/isal_crypto_plugin.h47
-rw-r--r--src/crypto/openssl/CMakeLists.txt14
-rw-r--r--src/crypto/openssl/openssl_crypto_accel.cc104
-rw-r--r--src/crypto/openssl/openssl_crypto_accel.h32
-rw-r--r--src/crypto/openssl/openssl_crypto_plugin.cc32
-rw-r--r--src/crypto/openssl/openssl_crypto_plugin.h36
-rw-r--r--src/crypto/qat/CMakeLists.txt20
-rw-r--r--src/crypto/qat/qat_crypto_accel.cc42
-rw-r--r--src/crypto/qat/qat_crypto_accel.h35
-rw-r--r--src/crypto/qat/qat_crypto_plugin.cc35
-rw-r--r--src/crypto/qat/qat_crypto_plugin.h42
-rw-r--r--src/crypto/qat/qcccrypto.cc471
-rw-r--r--src/crypto/qat/qcccrypto.h176
533 files changed, 162071 insertions, 0 deletions
diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt
new file mode 100644
index 000000000..f930886d3
--- /dev/null
+++ b/src/crypto/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_custom_target(crypto_plugins)
+set(crypto_plugin_dir ${CEPH_INSTALL_PKGLIBDIR}/crypto)
+
+add_subdirectory(openssl)
+
+if(HAVE_INTEL AND HAVE_NASM_X64_AVX2 AND (NOT APPLE))
+ add_subdirectory(isa-l)
+endif()
+
+if(WITH_QAT)
+ add_subdirectory(qat)
+endif()
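
The top-level src/crypto/CMakeLists.txt above only selects which plugin backends get built: OpenSSL unconditionally, ISA-L only on Intel targets with an AVX2-capable NASM (and not on macOS), and QAT only when requested. As a hedged illustration, assuming the usual out-of-tree Ceph CMake workflow and that WITH_QAT is exposed as a regular user-settable option, enabling the QAT plugin would look roughly like:

    # from an out-of-tree build directory; the QAT backend is opt-in
    cmake -DWITH_QAT=ON ..
    # crypto_plugins is the custom target the plugin libraries attach to
    cmake --build . --target crypto_plugins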
diff --git a/src/crypto/crypto_accel.h b/src/crypto/crypto_accel.h
new file mode 100644
index 000000000..5c1593609
--- /dev/null
+++ b/src/crypto/crypto_accel.h
@@ -0,0 +1,37 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CRYPTO_ACCEL_H
+#define CRYPTO_ACCEL_H
+#include <cstddef>
+#include "include/Context.h"
+
+class CryptoAccel;
+typedef std::shared_ptr<CryptoAccel> CryptoAccelRef;
+
+class CryptoAccel {
+ public:
+ CryptoAccel() {}
+ virtual ~CryptoAccel() {}
+
+ static const int AES_256_IVSIZE = 128/8;
+ static const int AES_256_KEYSIZE = 256/8;
+ virtual bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) = 0;
+ virtual bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) = 0;
+};
+#endif
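
crypto_accel.h defines the abstract interface every acceleration backend implements: one-shot AES-256-CBC encrypt and decrypt over a contiguous buffer, with the 16-byte IV and 32-byte key passed as fixed-size array references. Below is a minimal sketch of a backend using a hypothetical NullCryptoAccel class that is not part of this commit; a real backend (such as the OpenSSL one later in this series) calls into its crypto library instead of copying bytes.

    // Hypothetical sketch only -- illustrates the shape of a CryptoAccel backend.
    #include <cstring>
    #include "crypto/crypto_accel.h"

    class NullCryptoAccel : public CryptoAccel {
     public:
      bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
                       const unsigned char (&iv)[AES_256_IVSIZE],
                       const unsigned char (&key)[AES_256_KEYSIZE]) override {
        if (size % AES_256_IVSIZE != 0)
          return false;               // CBC operates on whole 16-byte blocks
        std::memcpy(out, in, size);   // placeholder for the real cipher call
        return true;
      }
      bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
                       const unsigned char (&iv)[AES_256_IVSIZE],
                       const unsigned char (&key)[AES_256_KEYSIZE]) override {
        if (size % AES_256_IVSIZE != 0)
          return false;
        std::memcpy(out, in, size);   // placeholder for the real cipher call
        return true;
      }
    };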
diff --git a/src/crypto/crypto_plugin.h b/src/crypto/crypto_plugin.h
new file mode 100644
index 000000000..cf22d5cb4
--- /dev/null
+++ b/src/crypto/crypto_plugin.h
@@ -0,0 +1,36 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CRYPTO_PLUGIN_H
+#define CRYPTO_PLUGIN_H
+
+// -----------------------------------------------------------------------------
+#include "common/PluginRegistry.h"
+#include "ostream"
+
+#include "crypto/crypto_accel.h"
+// -----------------------------------------------------------------------------
+
+class CryptoPlugin : public ceph::Plugin {
+
+public:
+ CryptoAccelRef cryptoaccel;
+ explicit CryptoPlugin(CephContext* cct) : Plugin(cct)
+ {}
+ ~CryptoPlugin()
+ {}
+ virtual int factory(CryptoAccelRef *cs,
+ std::ostream *ss) = 0;
+};
+#endif
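
crypto_plugin.h hooks an accelerator into Ceph's generic PluginRegistry: a concrete plugin caches a CryptoAccelRef and implements factory(), which hands the shared reference back to the caller. The real implementations in this series are isal_crypto_plugin.h, openssl_crypto_plugin.h and qat_crypto_plugin.h; the following is a minimal sketch with hypothetical names, reusing the NullCryptoAccel class from the previous example and assuming the same lazy-construction pattern.

    // Hypothetical sketch only -- not part of the commit.
    #include "crypto/crypto_plugin.h"

    class NullCryptoPlugin : public CryptoPlugin {
     public:
      explicit NullCryptoPlugin(CephContext* cct) : CryptoPlugin(cct) {}

      int factory(CryptoAccelRef *cs, std::ostream *ss) override {
        if (!cryptoaccel)
          cryptoaccel = CryptoAccelRef(new NullCryptoAccel);  // construct once, then share
        *cs = cryptoaccel;
        return 0;
      }
    };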
diff --git a/src/crypto/isa-l/CMakeLists.txt b/src/crypto/isa-l/CMakeLists.txt
new file mode 100644
index 000000000..2a2ec0bc0
--- /dev/null
+++ b/src/crypto/isa-l/CMakeLists.txt
@@ -0,0 +1,36 @@
+set(isal_dir ${CMAKE_SOURCE_DIR}/src/crypto/isa-l/isa-l_crypto)
+set(CMAKE_ASM_FLAGS "-i ${isal_dir}/aes/ -i ${isal_dir}/include/ ${CMAKE_ASM_FLAGS}")
+
+set(isal_crypto_plugin_srcs
+ isal_crypto_accel.cc
+ isal_crypto_plugin.cc
+ ${isal_dir}/aes/cbc_pre.c
+ ${isal_dir}/aes/cbc_multibinary.asm
+ ${isal_dir}/aes/keyexp_128.asm
+ ${isal_dir}/aes/keyexp_192.asm
+ ${isal_dir}/aes/keyexp_256.asm
+ ${isal_dir}/aes/keyexp_multibinary.asm
+ ${isal_dir}/aes/cbc_dec_128_x4_sse.asm
+ ${isal_dir}/aes/cbc_dec_128_x8_avx.asm
+ ${isal_dir}/aes/cbc_dec_192_x4_sse.asm
+ ${isal_dir}/aes/cbc_dec_192_x8_avx.asm
+ ${isal_dir}/aes/cbc_dec_256_x4_sse.asm
+ ${isal_dir}/aes/cbc_dec_256_x8_avx.asm
+ ${isal_dir}/aes/cbc_enc_128_x4_sb.asm
+ ${isal_dir}/aes/cbc_enc_128_x8_sb.asm
+ ${isal_dir}/aes/cbc_enc_192_x4_sb.asm
+ ${isal_dir}/aes/cbc_enc_192_x8_sb.asm
+ ${isal_dir}/aes/cbc_enc_256_x4_sb.asm
+ ${isal_dir}/aes/cbc_enc_256_x8_sb.asm)
+
+if(HAVE_NASM_X64)
+add_dependencies(crypto_plugins ceph_crypto_isal)
+endif(HAVE_NASM_X64)
+
+add_library(ceph_crypto_isal SHARED ${isal_crypto_plugin_srcs})
+target_include_directories(ceph_crypto_isal PRIVATE ${isal_dir}/include)
+set_target_properties(ceph_crypto_isal PROPERTIES
+ VERSION 1.0.0
+ SOVERSION 1
+ INSTALL_RPATH "")
+install(TARGETS ceph_crypto_isal DESTINATION ${crypto_plugin_dir})
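
The ISA-L plugin is assembled from the upstream isa-l_crypto AES-CBC assembly plus the two Ceph glue files, built as a versioned shared library and installed into the plugin directory defined by the parent CMakeLists.txt. Given the VERSION/SOVERSION properties above, the installed layout would look roughly like this (a sketch; the exact CEPH_INSTALL_PKGLIBDIR value is distro-dependent):

    ${CEPH_INSTALL_PKGLIBDIR}/crypto/libceph_crypto_isal.so.1.0.0
    ${CEPH_INSTALL_PKGLIBDIR}/crypto/libceph_crypto_isal.so.1 -> libceph_crypto_isal.so.1.0.0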
diff --git a/src/crypto/isa-l/isa-l_crypto/.gitignore b/src/crypto/isa-l/isa-l_crypto/.gitignore
new file mode 100644
index 000000000..5d7ff17ad
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/.gitignore
@@ -0,0 +1,27 @@
+# Objects
+*~
+*.o
+*.lo
+*.so
+*.dll
+*.exp
+*.lib
+bin
+
+# Autobuild
+Makefile
+Makefile.in
+aclocal.m4
+autom4te.cache
+build-aux
+config.*
+configure
+.deps
+.dirstamp
+.libs
+libtool
+
+# Generated files
+isa-l_crypto.h
+/libisal_crypto.la
+libisal_crypto.pc
diff --git a/src/crypto/isa-l/isa-l_crypto/CONTRIBUTING.md b/src/crypto/isa-l/isa-l_crypto/CONTRIBUTING.md
new file mode 100644
index 000000000..3e95c0b54
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/CONTRIBUTING.md
@@ -0,0 +1,39 @@
+# Contributing to ISA-L_crypto
+
+Everyone is welcome to contribute. Patches may be submitted using GitHub pull
+requests (PRs). All commits must be signed off by the developer (--signoff)
+which indicates that you agree to the Developer Certificate of Origin. Patch
+discussion will happen directly on the GitHub PR. Design pre-work and general
+discussion occurs on the [mailing list]. Anyone can provide feedback in either
+location and all discussion is welcome. Decisions on whether to merge patches
+will be handled by the maintainer.
+
+## License
+
+ISA-L_crypto is licensed using a BSD 3-clause [license]. All code submitted to
+the project is required to carry that license.
+
+## Certificate of Origin
+
+In order to get a clear contribution chain of trust we use the
+[signed-off-by language] used by the Linux kernel project.
+
+## Mailing List
+
+Contributors and users are welcome to submit new requests on our roadmap, submit
+patches, file issues, and ask questions on our [mailing list].
+
+## Coding Style
+
+The coding style for ISA-L_crypto C code roughly follows linux kernel
+guidelines. Use the included indent script to format C code.
+
+ ./tools/iindent your_files.c
+
+And use check format script before submitting.
+
+ ./tools/check_format.sh
+
+[mailing list]:https://lists.01.org/mailman/listinfo/isal
+[license]:LICENSE
+[signed-off-by language]:https://01.org/community/signed-process
diff --git a/src/crypto/isa-l/isa-l_crypto/Doxyfile b/src/crypto/isa-l/isa-l_crypto/Doxyfile
new file mode 100644
index 000000000..9b37aac53
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/Doxyfile
@@ -0,0 +1,31 @@
+PROJECT_NAME = "Intel Intelligent Storage Acceleration Library Crypto"
+PROJECT_BRIEF = "ISA-L_crypto API reference doc"
+
+OUTPUT_DIRECTORY = generated_doc
+FULL_PATH_NAMES = NO
+TAB_SIZE = 8
+ALIASES = "requires=\xrefitem requires \"Requires\" \"Instruction Set Requirements for arch-specific functions (non-multibinary)\""
+OPTIMIZE_OUTPUT_FOR_C = YES
+HIDE_UNDOC_MEMBERS = YES
+USE_MDFILE_AS_MAINPAGE = README.md
+
+INPUT = isa-l_crypto.h \
+ include \
+ README.md \
+ CONTRIBUTING.md \
+ Release_notes.txt
+
+EXCLUDE = include/test.h include/memcpy_inline.h include/intrinreg.h include/endian_helper.h
+EXCLUDE_PATTERNS = */include/*_multibinary.h
+EXAMPLE_PATH = . aes md5_mb mh_sha1 mh_sha1_murmur3_x64_128 mh_sha256 rolling_hash sha1_mb sha256_mb sha512_mb
+PAPER_TYPE = letter
+LATEX_SOURCE_CODE = YES
+GENERATE_TREEVIEW = YES
+MACRO_EXPANSION = YES
+EXPAND_ONLY_PREDEF = YES
+PREDEFINED = "DECLARE_ALIGNED(n, a)=ALIGN n" \
+ __declspec(x)='x' \
+ align(x)='ALIGN \
+ x'
+EXPAND_AS_DEFINED = DECLARE_ALIGNED
+EXTENSION_MAPPING = "txt=md"
diff --git a/src/crypto/isa-l/isa-l_crypto/LICENSE b/src/crypto/isa-l/isa-l_crypto/LICENSE
new file mode 100644
index 000000000..ecebef110
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/LICENSE
@@ -0,0 +1,26 @@
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/crypto/isa-l/isa-l_crypto/Makefile.am b/src/crypto/isa-l/isa-l_crypto/Makefile.am
new file mode 100644
index 000000000..9151aab1b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/Makefile.am
@@ -0,0 +1,161 @@
+EXTRA_DIST = autogen.sh Makefile.unx make.inc Makefile.nmake isa-l_crypto.def LICENSE README.md Doxyfile
+CLEANFILES =
+LDADD =
+AM_MAKEFLAGS = --no-print-directory
+noinst_HEADERS =
+pkginclude_HEADERS = include/test.h include/types.h include/endian_helper.h
+noinst_LTLIBRARIES =
+INCLUDE = -I $(srcdir)/include/
+
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = libisal_crypto.pc
+EXTRA_DIST += libisal_crypto.pc.in
+CLEANFILES += libisal_crypto.pc
+
+lsrc=
+src_include=
+extern_hdrs=
+other_src=
+check_tests=
+unit_tests=
+perf_tests=
+unit_tests_extra=
+perf_tests_extra=
+examples=
+other_tests=
+lsrc32=
+lsrc_x86_64=
+lsrc_x86_32=
+lsrc_aarch64=
+lsrc_base_aliases=
+unit_tests32=
+perf_tests32=
+
+# Include units
+include sha1_mb/Makefile.am
+include mh_sha1/Makefile.am
+include md5_mb/Makefile.am
+include sha256_mb/Makefile.am
+include sha512_mb/Makefile.am
+include mh_sha1_murmur3_x64_128/Makefile.am
+include mh_sha256/Makefile.am
+include rolling_hash/Makefile.am
+include sm3_mb/Makefile.am
+include aes/Makefile.am
+
+# LIB version info not necessarily the same as package version
+LIBISAL_CURRENT=2
+LIBISAL_REVISION=24
+LIBISAL_AGE=0
+
+lib_LTLIBRARIES = libisal_crypto.la
+pkginclude_HEADERS += $(sort ${extern_hdrs})
+libisal_crypto_la_SOURCES = ${lsrc}
+if CPU_X86_64
+libisal_crypto_la_SOURCES += ${lsrc_x86_64}
+endif
+
+if CPU_X86_32
+libisal_crypto_la_SOURCES += ${lsrc_x86_32}
+endif
+
+if CPU_AARCH64
+libisal_crypto_la_SOURCES += ${lsrc_aarch64}
+endif
+
+if CPU_UNDEFINED
+libisal_crypto_la_SOURCES += ${lsrc_base_aliases}
+endif
+
+nobase_include_HEADERS = isa-l_crypto.h
+libisal_crypto_la_LDFLAGS = $(AM_LDFLAGS) \
+ -version-info $(LIBISAL_CURRENT):$(LIBISAL_REVISION):$(LIBISAL_AGE)
+libisal_crypto_la_LIBADD = ${noinst_LTLIBRARIES}
+
+EXTRA_DIST += ${other_src}
+EXTRA_DIST += Release_notes.txt
+
+# For tests
+LDADD += libisal_crypto.la
+check_PROGRAMS = ${check_tests}
+TESTS = ${check_tests}
+
+# For additional tests
+EXTRA_PROGRAMS = ${unit_tests}
+EXTRA_PROGRAMS += ${perf_tests}
+EXTRA_PROGRAMS += ${other_tests}
+EXTRA_PROGRAMS += ${examples}
+CLEANFILES += ${EXTRA_PROGRAMS}
+
+perfs: ${perf_tests}
+tests: ${unit_tests}
+checks: ${check_tests}
+other: ${other_tests}
+perf: $(addsuffix .run,$(perf_tests))
+ex: ${examples}
+test: $(addsuffix .run,$(unit_tests))
+
+# Build rule to run tests
+%.run: %
+ $<
+ @echo Completed run: $<
+
+# Support for yasm/nasm
+if INTEL_CET_ENABLED
+ export CET_LD=$(LD)
+endif
+if USE_YASM
+if INTEL_CET_ENABLED
+ as_filter = ${srcdir}/tools/yasm-cet-filter.sh
+else
+ as_filter = ${srcdir}/tools/yasm-filter.sh
+endif
+endif
+if USE_NASM
+if INTEL_CET_ENABLED
+ as_filter = ${srcdir}/tools/nasm-cet-filter.sh
+else
+ as_filter = ${srcdir}/tools/nasm-filter.sh
+endif
+endif
+if CPU_AARCH64
+ as_filter = $(CC) -D__ASSEMBLY__
+endif
+CCAS = $(as_filter)
+EXTRA_DIST += tools/yasm-filter.sh tools/nasm-filter.sh
+EXTRA_DIST += tools/yasm-cet-filter.sh tools/nasm-cet-filter.sh
+
+AM_CFLAGS = ${my_CFLAGS} ${INCLUDE} $(src_include) ${D}
+AM_CCASFLAGS = ${yasm_args} ${INCLUDE} $(src_include) ${DEFS} ${D}
+
+.asm.s:
+ @echo " MKTMP " $@;
+ @cp $< $@
+
+# Generate isa-l_crypto.h
+BUILT_SOURCES = isa-l_crypto.h
+CLEANFILES += isa-l_crypto.h
+isa-l_crypto.h:
+ @echo 'Building $@'
+ @echo '' >> $@
+ @echo '/**' >> $@
+ @echo ' * @file isa-l_crypto.h'>> $@
+ @echo ' * @brief Include for ISA-L_crypto library' >> $@
+ @echo ' */' >> $@
+ @echo '' >> $@
+ @echo '#ifndef _ISAL_CRYPTO_H_' >> $@
+ @echo '#define _ISAL_CRYPTO_H_' >> $@
+ @echo '' >> $@
+ @echo '#define.ISAL_CRYPTO_MAJOR_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$3}' >> $@
+ @echo '#define.ISAL_CRYPTO_MINOR_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$4}' >> $@
+ @echo '#define.ISAL_CRYPTO_PATCH_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$5}' >> $@
+ @echo '#define ISAL_CRYPTO_MAKE_VERSION(maj, min, patch) ((maj) * 0x10000 + (min) * 0x100 + (patch))' >> $@
+ @echo '#define ISAL_CRYPTO_VERSION ISAL_CRYPTO_MAKE_VERSION(ISAL_CRYPTO_MAJOR_VERSION, ISAL_CRYPTO_MINOR_VERSION, ISAL_CRYPTO_PATCH_VERSION)' >> $@
+ @echo '' >> $@
+ @for unit in $(sort $(extern_hdrs)); do echo "#include <isa-l_crypto/$$unit>" | sed -e 's;include/;;' >> $@; done
+ @echo '#endif //_ISAL_CRYPTO_H_' >> $@
+
+doc: isa-l_crypto.h
+ (cat Doxyfile; echo 'PROJECT_NUMBER=${VERSION}') | doxygen -
+ $(MAKE) -C generated_doc/latex &> generated_doc/latex_build_api.log
+ cp generated_doc/latex/refman.pdf isa-l_crypto_api_${VERSION}.pdf
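
One non-obvious bit in the isa-l_crypto.h rule above is the version-macro generation: the '.' characters in the echoed '#define.ISAL_CRYPTO_*_VERSION.${VERSION}' strings act as field separators, so awk -F . can split the macro text and the dotted version number into one field list and print only the wanted pieces. A worked example, assuming VERSION expands to something like 2.24.0:

    $ echo '#define.ISAL_CRYPTO_MAJOR_VERSION.2.24.0' | awk -F . '{print $1, $2, $3}'
    #define ISAL_CRYPTO_MAJOR_VERSION 2
    $ echo '#define.ISAL_CRYPTO_MINOR_VERSION.2.24.0' | awk -F . '{print $1, $2, $4}'
    #define ISAL_CRYPTO_MINOR_VERSION 24
    $ echo '#define.ISAL_CRYPTO_PATCH_VERSION.2.24.0' | awk -F . '{print $1, $2, $5}'
    #define ISAL_CRYPTO_PATCH_VERSION 0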
diff --git a/src/crypto/isa-l/isa-l_crypto/Makefile.nmake b/src/crypto/isa-l/isa-l_crypto/Makefile.nmake
new file mode 100644
index 000000000..a3e577277
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/Makefile.nmake
@@ -0,0 +1,493 @@
+########################################################################
+# Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+# This file can be auto-regenerated with $make -f Makefile.unx Makefile.nmake
+
+objs = \
+ bin\sha1_ctx_sse.obj \
+ bin\sha1_ctx_avx.obj \
+ bin\sha1_ctx_avx2.obj \
+ bin\sha1_ctx_base.obj \
+ bin\sha1_mb_mgr_init_sse.obj \
+ bin\sha1_mb_mgr_init_avx2.obj \
+ bin\sha1_mb_mgr_submit_sse.obj \
+ bin\sha1_mb_mgr_submit_avx.obj \
+ bin\sha1_mb_mgr_submit_avx2.obj \
+ bin\sha1_mb_mgr_flush_sse.obj \
+ bin\sha1_mb_mgr_flush_avx.obj \
+ bin\sha1_mb_mgr_flush_avx2.obj \
+ bin\sha1_mb_x4_sse.obj \
+ bin\sha1_mb_x4_avx.obj \
+ bin\sha1_mb_x8_avx2.obj \
+ bin\sha1_multibinary.obj \
+ bin\sha1_ctx_avx512.obj \
+ bin\sha1_mb_mgr_init_avx512.obj \
+ bin\sha1_mb_mgr_submit_avx512.obj \
+ bin\sha1_mb_mgr_flush_avx512.obj \
+ bin\sha1_mb_x16_avx512.obj \
+ bin\sha1_opt_x1.obj \
+ bin\sha1_ni_x1.obj \
+ bin\sha1_ni_x2.obj \
+ bin\sha1_ctx_sse_ni.obj \
+ bin\sha1_ctx_avx512_ni.obj \
+ bin\sha1_mb_mgr_submit_sse_ni.obj \
+ bin\sha1_mb_mgr_flush_sse_ni.obj \
+ bin\sha1_mb_mgr_flush_avx512_ni.obj \
+ bin\sha256_ctx_sse.obj \
+ bin\sha256_ctx_avx.obj \
+ bin\sha256_ctx_avx2.obj \
+ bin\sha256_ctx_base.obj \
+ bin\sha256_mb_mgr_init_sse.obj \
+ bin\sha256_mb_mgr_init_avx2.obj \
+ bin\sha256_mb_mgr_submit_sse.obj \
+ bin\sha256_mb_mgr_submit_avx.obj \
+ bin\sha256_mb_mgr_submit_avx2.obj \
+ bin\sha256_mb_mgr_flush_sse.obj \
+ bin\sha256_mb_mgr_flush_avx.obj \
+ bin\sha256_mb_mgr_flush_avx2.obj \
+ bin\sha256_mb_x4_sse.obj \
+ bin\sha256_mb_x4_avx.obj \
+ bin\sha256_mb_x8_avx2.obj \
+ bin\sha256_multibinary.obj \
+ bin\sha256_ctx_avx512.obj \
+ bin\sha256_mb_mgr_init_avx512.obj \
+ bin\sha256_mb_mgr_submit_avx512.obj \
+ bin\sha256_mb_mgr_flush_avx512.obj \
+ bin\sha256_mb_x16_avx512.obj \
+ bin\sha256_opt_x1.obj \
+ bin\sha256_ni_x1.obj \
+ bin\sha256_ni_x2.obj \
+ bin\sha256_ctx_sse_ni.obj \
+ bin\sha256_ctx_avx512_ni.obj \
+ bin\sha256_mb_mgr_submit_sse_ni.obj \
+ bin\sha256_mb_mgr_flush_sse_ni.obj \
+ bin\sha256_mb_mgr_flush_avx512_ni.obj \
+ bin\sha512_ctx_sse.obj \
+ bin\sha512_ctx_avx.obj \
+ bin\sha512_ctx_avx2.obj \
+ bin\sha512_ctx_sb_sse4.obj \
+ bin\sha512_ctx_base.obj \
+ bin\sha512_mb_mgr_init_sse.obj \
+ bin\sha512_mb_mgr_init_avx2.obj \
+ bin\sha512_sb_mgr_init_sse4.obj \
+ bin\sha512_mb_mgr_submit_sse.obj \
+ bin\sha512_mb_mgr_submit_avx.obj \
+ bin\sha512_mb_mgr_submit_avx2.obj \
+ bin\sha512_mb_mgr_flush_sse.obj \
+ bin\sha512_mb_mgr_flush_avx.obj \
+ bin\sha512_mb_mgr_flush_avx2.obj \
+ bin\sha512_mb_x2_sse.obj \
+ bin\sha512_mb_x2_avx.obj \
+ bin\sha512_mb_x4_avx2.obj \
+ bin\sha512_multibinary.obj \
+ bin\sha512_sb_mgr_submit_sse4.obj \
+ bin\sha512_sb_mgr_flush_sse4.obj \
+ bin\sha512_sse4.obj \
+ bin\sha512_ctx_avx512.obj \
+ bin\sha512_mb_mgr_init_avx512.obj \
+ bin\sha512_mb_mgr_submit_avx512.obj \
+ bin\sha512_mb_mgr_flush_avx512.obj \
+ bin\sha512_mb_x8_avx512.obj \
+ bin\md5_ctx_sse.obj \
+ bin\md5_ctx_avx.obj \
+ bin\md5_ctx_avx2.obj \
+ bin\md5_ctx_base.obj \
+ bin\md5_mb_mgr_init_sse.obj \
+ bin\md5_mb_mgr_init_avx2.obj \
+ bin\md5_mb_mgr_init_avx512.obj \
+ bin\md5_mb_mgr_submit_sse.obj \
+ bin\md5_mb_mgr_submit_avx.obj \
+ bin\md5_mb_mgr_submit_avx2.obj \
+ bin\md5_mb_mgr_flush_sse.obj \
+ bin\md5_mb_mgr_flush_avx.obj \
+ bin\md5_mb_mgr_flush_avx2.obj \
+ bin\md5_mb_x4x2_sse.obj \
+ bin\md5_mb_x4x2_avx.obj \
+ bin\md5_mb_x8x2_avx2.obj \
+ bin\md5_multibinary.obj \
+ bin\md5_mb_mgr_submit_avx512.obj \
+ bin\md5_mb_mgr_flush_avx512.obj \
+ bin\md5_mb_x16x2_avx512.obj \
+ bin\md5_ctx_avx512.obj \
+ bin\mh_sha1_block_base.obj \
+ bin\mh_sha1_finalize_base.obj \
+ bin\mh_sha1_update_base.obj \
+ bin\sha1_for_mh_sha1.obj \
+ bin\mh_sha1.obj \
+ bin\mh_sha1_multibinary.obj \
+ bin\mh_sha1_block_sse.obj \
+ bin\mh_sha1_block_avx.obj \
+ bin\mh_sha1_block_avx2.obj \
+ bin\mh_sha1_block_avx512.obj \
+ bin\mh_sha1_avx512.obj \
+ bin\murmur3_x64_128_internal.obj \
+ bin\mh_sha1_murmur3_x64_128.obj \
+ bin\mh_sha1_murmur3_x64_128_finalize_base.obj \
+ bin\mh_sha1_murmur3_x64_128_update_base.obj \
+ bin\mh_sha1_murmur3_x64_128_block_sse.obj \
+ bin\mh_sha1_murmur3_x64_128_block_avx.obj \
+ bin\mh_sha1_murmur3_x64_128_block_avx2.obj \
+ bin\mh_sha1_murmur3_x64_128_multibinary.obj \
+ bin\mh_sha1_murmur3_x64_128_avx512.obj \
+ bin\mh_sha1_murmur3_x64_128_block_avx512.obj \
+ bin\sha256_for_mh_sha256.obj \
+ bin\mh_sha256.obj \
+ bin\mh_sha256_block_sse.obj \
+ bin\mh_sha256_block_avx.obj \
+ bin\mh_sha256_block_avx2.obj \
+ bin\mh_sha256_multibinary.obj \
+ bin\mh_sha256_finalize_base.obj \
+ bin\mh_sha256_update_base.obj \
+ bin\mh_sha256_block_base.obj \
+ bin\mh_sha256_block_avx512.obj \
+ bin\mh_sha256_avx512.obj \
+ bin\rolling_hashx_base.obj \
+ bin\rolling_hash2.obj \
+ bin\rolling_hash2_until_04.obj \
+ bin\rolling_hash2_until_00.obj \
+ bin\rolling_hash2_multibinary.obj \
+ bin\sm3_ctx_base.obj \
+ bin\sm3_multibinary.obj \
+ bin\sm3_ctx_avx512.obj \
+ bin\sm3_mb_mgr_submit_avx512.obj \
+ bin\sm3_mb_mgr_flush_avx512.obj \
+ bin\sm3_mb_x16_avx512.obj \
+ bin\sm3_ctx_avx2.obj \
+ bin\sm3_mb_mgr_submit_avx2.obj \
+ bin\sm3_mb_mgr_flush_avx2.obj \
+ bin\sm3_mb_x8_avx2.obj \
+ bin\gcm_multibinary.obj \
+ bin\gcm_pre.obj \
+ bin\gcm128_avx_gen2.obj \
+ bin\gcm128_avx_gen4.obj \
+ bin\gcm128_sse.obj \
+ bin\gcm256_avx_gen2.obj \
+ bin\gcm256_avx_gen4.obj \
+ bin\gcm256_sse.obj \
+ bin\gcm128_vaes_avx512.obj \
+ bin\gcm256_vaes_avx512.obj \
+ bin\gcm128_avx_gen2_nt.obj \
+ bin\gcm128_avx_gen4_nt.obj \
+ bin\gcm128_sse_nt.obj \
+ bin\gcm256_avx_gen2_nt.obj \
+ bin\gcm256_avx_gen4_nt.obj \
+ bin\gcm256_sse_nt.obj \
+ bin\gcm128_vaes_avx512_nt.obj \
+ bin\gcm256_vaes_avx512_nt.obj \
+ bin\gcm_multibinary_nt.obj \
+ bin\keyexp_multibinary.obj \
+ bin\keyexp_128.obj \
+ bin\keyexp_192.obj \
+ bin\keyexp_256.obj \
+ bin\cbc_multibinary.obj \
+ bin\cbc_dec_128_x4_sse.obj \
+ bin\cbc_dec_128_x8_avx.obj \
+ bin\cbc_dec_192_x4_sse.obj \
+ bin\cbc_dec_192_x8_avx.obj \
+ bin\cbc_dec_256_x4_sse.obj \
+ bin\cbc_dec_256_x8_avx.obj \
+ bin\cbc_enc_128_x4_sb.obj \
+ bin\cbc_enc_128_x8_sb.obj \
+ bin\cbc_enc_192_x4_sb.obj \
+ bin\cbc_enc_192_x8_sb.obj \
+ bin\cbc_enc_256_x4_sb.obj \
+ bin\cbc_enc_256_x8_sb.obj \
+ bin\cbc_dec_vaes_avx512.obj \
+ bin\cbc_pre.obj \
+ bin\xts_aes_128_multibinary.obj \
+ bin\XTS_AES_128_dec_sse.obj \
+ bin\XTS_AES_128_dec_expanded_key_sse.obj \
+ bin\XTS_AES_128_enc_sse.obj \
+ bin\XTS_AES_128_enc_expanded_key_sse.obj \
+ bin\XTS_AES_128_dec_avx.obj \
+ bin\XTS_AES_128_dec_expanded_key_avx.obj \
+ bin\XTS_AES_128_enc_avx.obj \
+ bin\XTS_AES_128_enc_expanded_key_avx.obj \
+ bin\xts_aes_256_multibinary.obj \
+ bin\XTS_AES_256_dec_avx.obj \
+ bin\XTS_AES_256_dec_expanded_key_avx.obj \
+ bin\XTS_AES_256_enc_avx.obj \
+ bin\XTS_AES_256_enc_expanded_key_avx.obj \
+ bin\XTS_AES_256_dec_sse.obj \
+ bin\XTS_AES_256_dec_expanded_key_sse.obj \
+ bin\XTS_AES_256_enc_sse.obj \
+ bin\XTS_AES_256_enc_expanded_key_sse.obj \
+ bin\XTS_AES_256_enc_vaes.obj \
+ bin\XTS_AES_128_enc_vaes.obj \
+ bin\XTS_AES_256_enc_expanded_key_vaes.obj \
+ bin\XTS_AES_128_enc_expanded_key_vaes.obj \
+ bin\XTS_AES_256_dec_vaes.obj \
+ bin\XTS_AES_128_dec_vaes.obj \
+ bin\XTS_AES_256_dec_expanded_key_vaes.obj \
+ bin\XTS_AES_128_dec_expanded_key_vaes.obj
+
+INCLUDES = -I./ -Isha1_mb/ -Isha256_mb/ -Isha512_mb/ -Imd5_mb/ -Imh_sha1/ -Imh_sha1_murmur3_x64_128/ -Imh_sha256/ -Irolling_hash/ -Ism3_mb/ -Iaes/ -Iinclude/
+# Modern asm feature level, consider upgrading nasm/yasm before decreasing feature_level
+FEAT_FLAGS = -DHAVE_AS_KNOWS_AVX512 -DAS_FEATURE_LEVEL=10 -DHAVE_AS_KNOWS_SHANI
+CFLAGS_REL = -O2 -DNDEBUG /Z7 /MD /Gy
+CFLAGS_DBG = -Od -DDEBUG /Z7 /MDd
+LINKFLAGS = -nologo -incremental:no -debug
+CFLAGS = $(CFLAGS_REL) -nologo -D_USE_MATH_DEFINES $(FEAT_FLAGS) $(INCLUDES) $(D)
+AFLAGS = -f win64 $(FEAT_FLAGS) $(INCLUDES) $(D)
+CC = cl
+# or CC = icl -Qstd=c99
+AS = nasm
+
+lib: bin static dll
+static: bin isa-l_crypto_static.lib
+dll: bin isa-l_crypto.dll
+
+bin: ; -mkdir $@
+
+isa-l_crypto_static.lib: $(objs)
+ lib -out:$@ @<<
+$?
+<<
+
+isa-l_crypto.dll: $(objs)
+ link -out:$@ -dll -def:isa-l_crypto.def $(LINKFLAGS) @<<
+$?
+<<
+
+{sha1_mb}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{sha1_mb}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{sha256_mb}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{sha256_mb}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{sha512_mb}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{sha512_mb}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{md5_mb}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{md5_mb}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{mh_sha1}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{mh_sha1}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{mh_sha1_murmur3_x64_128}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{mh_sha1_murmur3_x64_128}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{mh_sha256}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{mh_sha256}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{rolling_hash}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{rolling_hash}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{sm3_mb}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{sm3_mb}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{aes}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{aes}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+
+# Examples
+ex = \
+ sha1_multi_buffer_example.exe \
+ gcm_simple_example.exe
+
+ex: lib $(ex)
+
+$(ex): $(@B).obj
+
+.obj.exe:
+ link /out:$@ $(LINKFLAGS) isa-l_crypto.lib $?
+
+# Check tests
+checks = \
+ sha1_mb_test.exe \
+ sha1_mb_rand_test.exe \
+ sha1_mb_rand_update_test.exe \
+ sha1_mb_flush_test.exe \
+ sha256_mb_test.exe \
+ sha256_mb_rand_test.exe \
+ sha256_mb_rand_update_test.exe \
+ sha256_mb_flush_test.exe \
+ sha512_mb_test.exe \
+ sha512_mb_rand_test.exe \
+ sha512_mb_rand_update_test.exe \
+ md5_mb_test.exe \
+ md5_mb_rand_test.exe \
+ md5_mb_rand_update_test.exe \
+ mh_sha1_test.exe \
+ mh_sha256_test.exe \
+ rolling_hash2_test.exe \
+ sm3_ref_test.exe \
+ cbc_std_vectors_test.exe \
+ gcm_std_vectors_test.exe \
+ gcm_nt_std_vectors_test.exe \
+ xts_128_test.exe \
+ xts_256_test.exe \
+ xts_128_expanded_key_test.exe \
+ xts_256_expanded_key_test.exe
+
+checks: lib $(checks)
+$(checks): $(@B).obj
+check: $(checks)
+ !$?
+
+# Unit tests
+tests = \
+ sha1_mb_rand_ssl_test.exe \
+ sha256_mb_rand_ssl_test.exe \
+ sha512_mb_rand_ssl_test.exe \
+ md5_mb_rand_ssl_test.exe \
+ mh_sha1_update_test.exe \
+ mh_sha1_murmur3_x64_128_test.exe \
+ mh_sha1_murmur3_x64_128_update_test.exe \
+ mh_sha256_update_test.exe \
+ sm3_mb_rand_ssl_test.exe \
+ sm3_mb_rand_test.exe \
+ sm3_mb_rand_update_test.exe \
+ sm3_mb_flush_test.exe \
+ sm3_mb_test.exe \
+ cbc_std_vectors_random_test.exe \
+ gcm_std_vectors_random_test.exe \
+ gcm_nt_rand_test.exe \
+ xts_128_rand.exe \
+ xts_128_rand_ossl_test.exe \
+ xts_256_rand.exe \
+ xts_256_rand_ossl_test.exe
+
+tests: lib $(tests)
+$(tests): $(@B).obj
+
+# Performance tests
+perfs = \
+ sha1_mb_vs_ossl_perf.exe \
+ sha1_mb_vs_ossl_shortage_perf.exe \
+ sha256_mb_vs_ossl_perf.exe \
+ sha256_mb_vs_ossl_shortage_perf.exe \
+ sha512_mb_vs_ossl_perf.exe \
+ md5_mb_vs_ossl_perf.exe \
+ mh_sha1_perf.exe \
+ mh_sha1_murmur3_x64_128_perf.exe \
+ mh_sha256_perf.exe \
+ rolling_hash2_perf.exe \
+ sm3_mb_vs_ossl_perf.exe \
+ sm3_mb_vs_ossl_shortage_perf.exe \
+ cbc_ossl_perf.exe \
+ gcm_ossl_perf.exe \
+ xts_128_enc_ossl_perf.exe \
+ xts_256_enc_ossl_perf.exe \
+ xts_128_enc_perf.exe \
+ xts_128_dec_perf.exe \
+ xts_128_dec_ossl_perf.exe \
+ xts_256_enc_perf.exe \
+ xts_256_dec_perf.exe \
+ xts_256_dec_ossl_perf.exe
+
+perfs: lib $(perfs)
+$(perfs): $(@B).obj
+
+progs =
+
+progs: lib $(progs)
+
+clean:
+ -if exist *.obj del *.obj
+ -if exist bin\*.obj del bin\*.obj
+ -if exist isa-l_crypto_static.lib del isa-l_crypto_static.lib
+ -if exist *.exe del *.exe
+ -if exist *.pdb del *.pdb
+ -if exist isa-l_crypto.lib del isa-l_crypto.lib
+ -if exist isa-l_crypto.dll del isa-l_crypto.dll
+ -if exist isa-l_crypto.exp del isa-l_crypto.exp
+
+libcrypto.lib:
+sha1_mb_rand_test.exe: sha1_ref.obj
+sha1_mb_rand_update_test.exe: sha1_ref.obj
+sha1_mb_flush_test.exe: sha1_ref.obj
+sha1_mb_rand_ssl_test.exe: libcrypto.lib
+sha1_mb_vs_ossl_perf.exe: libcrypto.lib
+sha1_mb_vs_ossl_shortage_perf.exe: libcrypto.lib
+sha256_mb_rand_ssl_test.exe: sha256_ref.obj
+sha256_mb_rand_test.exe: sha256_ref.obj
+sha256_mb_rand_update_test.exe: sha256_ref.obj
+sha256_mb_flush_test.exe: sha256_ref.obj
+sha256_mb_rand_ssl_test.exe: libcrypto.lib
+sha256_mb_vs_ossl_perf.exe: libcrypto.lib
+sha256_mb_vs_ossl_shortage_perf.exe: libcrypto.lib
+sha512_mb_rand_test.exe: sha512_ref.obj
+sha512_mb_rand_update_test.exe: sha512_ref.obj
+sha512_mb_rand_ssl_test.exe: libcrypto.lib
+sha512_mb_vs_ossl_perf.exe: libcrypto.lib
+md5_mb_rand_test.exe: md5_ref.obj
+md5_mb_rand_update_test.exe: md5_ref.obj
+md5_mb_rand_ssl_test.exe: libcrypto.lib
+md5_mb_vs_ossl_perf.exe: libcrypto.lib
+mh_sha1_test.exe: mh_sha1_ref.obj
+mh_sha1_update_test.exe: mh_sha1_ref.obj
+mh_sha1_murmur3_x64_128_test.exe: mh_sha1_ref.obj murmur3_x64_128.obj
+mh_sha1_murmur3_x64_128_update_test.exe: mh_sha1_ref.obj murmur3_x64_128.obj
+mh_sha1_murmur3_x64_128_perf.exe: mh_sha1_ref.obj murmur3_x64_128.obj
+mh_sha256_test.exe: mh_sha256_ref.obj
+mh_sha256_update_test.exe: mh_sha256_ref.obj
+sm3_mb_rand_ssl_test.exe: libcrypto.lib
+sm3_mb_rand_ssl_test.exe: sm3_test_helper.obj
+sm3_mb_rand_update_test.exe: libcrypto.lib
+sm3_mb_rand_update_test.exe: sm3_test_helper.obj
+sm3_mb_flush_test.exe: libcrypto.lib
+sm3_mb_flush_test.exe: sm3_test_helper.obj
+sm3_mb_rand_test.exe: libcrypto.lib
+sm3_mb_rand_test.exe: sm3_test_helper.obj
+sm3_mb_vs_ossl_perf.exe: libcrypto.lib
+sm3_mb_vs_ossl_perf.exe: sm3_test_helper.obj
+sm3_mb_vs_ossl_shortage_perf.exe: libcrypto.lib
+sm3_mb_vs_ossl_shortage_perf.exe: sm3_test_helper.obj
+cbc_ossl_perf.exe: libcrypto.lib
+cbc_std_vectors_random_test.exe: libcrypto.lib
+gcm_ossl_perf.exe: libcrypto.lib
+gcm_std_vectors_random_test.exe: libcrypto.lib
+gcm_nt_rand_test.exe: libcrypto.lib
+xts_128_enc_ossl_perf.exe: libcrypto.lib
+xts_128_dec_ossl_perf.exe: libcrypto.lib
+xts_128_rand_ossl_test.exe: libcrypto.lib
+xts_256_enc_ossl_perf.exe: libcrypto.lib
+xts_256_dec_ossl_perf.exe: libcrypto.lib
+xts_256_rand_ossl_test.exe: libcrypto.lib
diff --git a/src/crypto/isa-l/isa-l_crypto/Makefile.unx b/src/crypto/isa-l/isa-l_crypto/Makefile.unx
new file mode 100644
index 000000000..7452f71b0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/Makefile.unx
@@ -0,0 +1,50 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+host_cpu ?= $(shell uname -m | sed -e 's/amd/x86_/')
+arch ?= $(shell uname | grep -v -e Linux -e BSD )
+
+
+
+units ?=sha1_mb sha256_mb sha512_mb md5_mb mh_sha1 mh_sha1_murmur3_x64_128 \
+ mh_sha256 rolling_hash sm3_mb
+ifneq ($(arch),noarch)
+units +=aes
+endif
+ifeq ($(host_cpu)_$(arch),aarch64_)
+ arch = aarch64
+endif
+default: lib
+include $(foreach unit,$(units), $(unit)/Makefile.am)
+
+# Override individual lib names to make one inclusive library.
+lib_name := bin/isa-l_crypto.a
+
+include make.inc
+include tools/gen_nmake.mk
+VPATH = . $(units) include
diff --git a/src/crypto/isa-l/isa-l_crypto/README.md b/src/crypto/isa-l/isa-l_crypto/README.md
new file mode 100644
index 000000000..f9f560c54
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/README.md
@@ -0,0 +1,63 @@
+Intel(R) Intelligent Storage Acceleration Library Crypto Version
+================================================================
+
+ISA-L_crypto is a collection of optimized low-level functions targeting storage
+applications. ISA-L_crypto includes:
+
+* Multi-buffer hashes - run multiple hash jobs together on one core for much
+  better throughput than single-buffer versions (see the usage sketch after
+  this list).
+  - SHA1, SHA256, SHA512, MD5, SM3
+
+* Multi-hash - Get the performance of multi-buffer hashing with a single-buffer
+  interface. Specification ref: [Multi-Hash white paper](https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/multi-hash-paper.pdf)
+
+* Multi-hash + murmur - run both together.
+
+* AES - block ciphers
+ - XTS, GCM, CBC
+
+* Rolling hash - Hash input in a window which moves through the input
+
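A minimal sketch of the multi-buffer call pattern described above, using the SHA256
variant. It assumes the `sha256_mb.h` context-manager API (`SHA256_HASH_CTX_MGR`,
`sha256_ctx_mgr_init`/`_submit`/`_flush`), roughly the pattern the bundled tests and
the `sha1_multi_buffer_example` follow for SHA1; the job count and buffer size are
placeholders:

    #include <stdlib.h>
    #include <string.h>
    #include "sha256_mb.h"

    #define NUM_JOBS 16
    #define BUF_LEN  4096

    int main(void)
    {
        SHA256_HASH_CTX_MGR *mgr = NULL;
        SHA256_HASH_CTX ctxpool[NUM_JOBS];
        static unsigned char bufs[NUM_JOBS][BUF_LEN];
        int i;

        /* The context manager holds the SIMD lane state; keep it 16-byte aligned. */
        if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
            return 1;
        sha256_ctx_mgr_init(mgr);

        /* Each submit starts an independent job; jobs are hashed together in lanes. */
        for (i = 0; i < NUM_JOBS; i++) {
            hash_ctx_init(&ctxpool[i]);
            memset(bufs[i], i, BUF_LEN);
            sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], BUF_LEN, HASH_ENTIRE);
        }

        /* Flush drains lanes that never filled; it returns NULL when all jobs are done. */
        while (sha256_ctx_mgr_flush(mgr) != NULL)
            ;

        /* Digest words are now in ctxpool[i].job.result_digest[0..7]. */
        free(mgr);
        return 0;
    }

Link such a program against the built library (for example -lisal_crypto).
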
+Also see:
+* [ISA-L_crypto for updates](https://github.com/intel/isa-l_crypto).
+* For non-crypto ISA-L see [isa-l on github](https://github.com/intel/isa-l).
+* The [github wiki](https://github.com/intel/isa-l/wiki) covering isa-l and
+ isa-l crypto.
+* [Contributing](CONTRIBUTING.md).
+
+Building ISA-L
+--------------
+
+### Prerequisites
+
+* Assembler: nasm v2.11.01 or later (nasm v2.13 or better suggested for building in AVX512 support)
+ or yasm version 1.2.0 or later.
+* Compiler: gcc, clang, icc or VC compiler.
+* Make: GNU 'make' or 'nmake' (Windows).
+* Optional: Building with autotools requires autoconf/automake packages.
+
+### Autotools
+To build and install the library with autotools it is usually sufficient to run:
+
+ ./autogen.sh
+ ./configure
+ make
+ sudo make install
+
+### Makefile
+To use a standard makefile run:
+
+ make -f Makefile.unx
+
+### Windows
+On Windows use nmake to build dll and static lib:
+
+ nmake -f Makefile.nmake
+
+### Other make targets
+Other targets include:
+* `make check` : create and run tests
+* `make tests` : create additional unit tests
+* `make perfs` : create included performance tests
+* `make ex` : build examples
+* `make doc` : build API manual
diff --git a/src/crypto/isa-l/isa-l_crypto/Release_notes.txt b/src/crypto/isa-l/isa-l_crypto/Release_notes.txt
new file mode 100644
index 000000000..097107585
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/Release_notes.txt
@@ -0,0 +1,215 @@
+================================================================================
+v2.24 Intel Intelligent Storage Acceleration Library Crypto Release Notes
+================================================================================
+
+================================================================================
+RELEASE NOTE CONTENTS
+================================================================================
+1. KNOWN ISSUES
+2. FIXED ISSUES
+3. CHANGE LOG & FEATURES ADDED
+
+================================================================================
+1. KNOWN ISSUES
+================================================================================
+
+* Perf tests do not run in Windows environment.
+
+* 32-bit lib is not supported in Windows.
+
+================================================================================
+2. FIXED ISSUES
+================================================================================
+v2.21
+
+* Put correct vec instruction versions in aes_cbc_enc_{128,192,256}(). May help
+ performance on some systems.
+
+v2.20
+
+* Fix issue with new aes_gcm API, aes_gcm_pre_256 was incorrect.
+
+* Multi-buffer hash max length extended. Previous max length for
+ {sha1,sha256,sha512,md5}_mb was 4095MB. While there is still a 4GB limit for
+  each submit, the total hashed length can now be larger than 4GB.
+
+v2.18
+
+* Fix for multi-buffer hash when total length is above 512MB.
+
+v2.14
+
+* Building in unit directories is no longer supported, removing the issue of
+  leftover object files causing the top-level make build to fail.
+
+v2.9
+
+* Multi-buffer MD5 AVX2 tests fixed to work on FreeBSD 9.1 by explicitly aligning
+ structures.
+
+v2.7
+
+* Unit tests and examples are now supported in the Windows environment.
+
+
+================================================================================
+3. CHANGE LOG & FEATURES ADDED
+================================================================================
+v2.24
+
+* New optimized version of AES-CBC decode
+
+* New AVX2, 8 lane version of multi-buffer SM3
+
+* Added support for big-endian architectures
+
+v2.23
+
+* New optimized versions of block ciphers AES-GCM and AES-XTS.
+
+* New optimized versions of multi-buffer SM3 hashing. Removed experimental
+ status.
+
+v2.22
+
+* New multi-buffer SM3 functions. Experimental base functions only.
+
+* New multi-arch support.
+
+v2.21
+
+* Multi-buffer hash performance improvement for Intel(R) Atom(tm) processors.
+ New by-2 shani versions for multi-buffer sha1 & sha256.
+
+* New base functions for multi-buffer hashes.
+ md5_mb, sha1_mb, sha256_mb, sha512_mb.
+
+v2.20
+
+* New functions
+ - Non-temporal versions of aes_gcm added.
+
+* Multi-buffer hash improvement
+ - Increase max length of hash in {sha1,sha256,sha512,md5}_mb to > 4GB.
+
+v2.19
+
+* Multi-buffer hash (sha1_mb, sha256_mb)
+
+ - Choose fast single buffer routine to do flush operation if lanes aren't full.
+
+ - Add SHA-NI support for Goldmont and Cannonlake.
+
+* AES-GCM interface updates.
+
+  - New interface separates the expanded keys and other context into two
+    structures. The old interface is maintained for backward compatibility.
+    (A minimal call sequence is sketched after this entry.)
+
+  - The user no longer has to append the GCM_IV_END_MARK manually to the end
+    of the IV, as this is now done automatically. This update should also
+    improve the performance of small packets.
+
+* Rolling hash is released.
+
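A minimal sketch of the two-structure interface noted in the v2.19 entry above.
It assumes the aes_gcm.h names (struct gcm_key_data, struct gcm_context_data,
aes_gcm_pre_128, aes_gcm_enc_128) and uses placeholder key, IV and buffer values:

    #include <stdint.h>
    #include "aes_gcm.h"

    int main(void)
    {
        struct gcm_key_data gkey;        /* expanded keys, reusable across messages */
        struct gcm_context_data gctx;    /* per-message working state */
        uint8_t key[16] = {0}, iv[12] = {0};
        uint8_t aad[16] = {0}, tag[16];
        uint8_t pt[64] = {0}, ct[64];

        /* Expand the key once; no GCM_IV_END_MARK handling is needed. */
        aes_gcm_pre_128(key, &gkey);

        /* Encrypt and authenticate in one call; iv is the plain 12-byte IV. */
        aes_gcm_enc_128(&gkey, &gctx, ct, pt, sizeof(pt), iv,
                        aad, sizeof(aad), tag, sizeof(tag));
        return 0;
    }
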
+v2.18
+
+* New multi-hash SHA256-based version.
+
+v2.16
+
+* Split lib from non-crypto functions.
+
+v2.15
+
+* Multi-buffer hash updates. New AVX512 versions for multi-buffer SHA1, SHA256,
+ SHA512, MD5 and SHA1+murmur3_x64_128 stitched.
+
+* Removes restrictions on AAD length in AES-GCM. Previously AAD length was
+ limited to a multiple of 4 bytes. Now any AAD length is allowed.
+
+* Nasm support. ISA-L ported to build with nasm or yasm assembler.
+
+* Windows DLL support. Windows builds DLL by default.
+
+* The older, deprecated multi-buffer API has been removed.
+
+v2.14
+
+* New multi-hash sha1 function and multi-hash sha1 + murmur3_x64_128 stitched.
+ Multi-hash is designed to give the performance of multi-buffer cryptographic
+ hashes with a synchronous single buffer interface.
+
+* New AES-GCM and AES-CBC functions added.
+
+* Autoconf and autotools build allows easier porting to additional systems.
+ Previous make system still available to embedded users with Makefile.unx.
+
+* The AES key expand functions that were used for AES-XTS with pre-expanded keys
+ now expand the decrypt keys in a different order. The order that decrypt keys
+ are stored and used by XTS_AES_128_dec_expanded_key() is reversed from
+  previous versions to be compatible with CBC and GCM key expansion. The
+ aes_keyexp_*() and XTS_AES_128_dec_expanded_key() functions should work the
+ same when paired together.
+
+* Includes update for building on Mac OS X/darwin systems. Add --target=darwin
+ to ./configure step.
+
+v2.10
+
+* Added multi-buffer MD5 in the new hash API. Includes multi-binary capability,
+ no restriction on update length and other benefits of the CTX API.
+
+v2.9
+
+* New multi-buffer hash API. The new API brings the following new features to
+ multi-buffer hashes. The older API is still included but may be deprecated in
+ future releases.
+
+ - Multibinary functionality. Call one function and the appropriate
+ architecture-specific version is fixed up at runtime.
+
+  - No restriction on update length. Submitting an update block no longer has
+    to have a length that is a multiple of the fundamental block size.
+
+* New expanded key tests added for AES-XTS 128 and 256
+
+v2.7
+
+* New AVX2 versions for mb_md5 and mb_sha512 hashing code have been added.
+
+v2.6
+
+* Update buffer functionality added to mb_md5, mb_sha256 and mb_sha512 hashing
+ code. Requires API changes to current interface to specify job type and total
+ length of hash.
+
+* New AVX2 versions for mb_sha1 and mb_sha256 hashing code have been added.
+
+v2.5
+
+* New feature for multi-buffer SHA-1, update buffer. mb_sha1 non-finalize jobs
+ can now be submitted by setting flags in job structure. Requires API changes
+ to current interface to specify job type and total length of hash.
+
+v2.4
+
+* Added new multi-buffer SHA-512: mb_sha512. SSE, AVX versions.
+
+v2.3
+
+* Added improved AES XTS versions.
+
+v2.2
+
+* Added new AVX versions of multi-buffer hashes
+* Changed a type in the interface structs for multi-buffer hashes:
+  the len field in the JOB_SHA1, JOB_MD5 and JOB_SHA256 structures
+  is now a 32-bit int.
+
+v2.0
+
+* Added AES XTS units aes_xts_128, aes_xts_256
+
+v1.3
+
+* Added new multi-buffer units for SHA-256 and MD5: mb_sha256, mb_md5.
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am b/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am
new file mode 100644
index 000000000..d1f4e5781
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am
@@ -0,0 +1,170 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+# Assembling AES requires including cbc_common.asm, gcm_defines.asm
+src_include += -I $(srcdir)/aes
+
+extern_hdrs += include/aes_gcm.h include/aes_cbc.h include/aes_xts.h include/aes_keyexp.h
+
+lsrc_x86_64 += aes/gcm_multibinary.asm aes/gcm_pre.c
+lsrc_x86_64 += aes/gcm128_avx_gen2.asm aes/gcm128_avx_gen4.asm aes/gcm128_sse.asm
+lsrc_x86_64 += aes/gcm256_avx_gen2.asm aes/gcm256_avx_gen4.asm aes/gcm256_sse.asm
+lsrc_x86_64 += aes/gcm128_vaes_avx512.asm aes/gcm256_vaes_avx512.asm
+lsrc_x86_64 += aes/gcm128_avx_gen2_nt.asm aes/gcm128_avx_gen4_nt.asm aes/gcm128_sse_nt.asm
+lsrc_x86_64 += aes/gcm256_avx_gen2_nt.asm aes/gcm256_avx_gen4_nt.asm aes/gcm256_sse_nt.asm
+lsrc_x86_64 += aes/gcm128_vaes_avx512_nt.asm aes/gcm256_vaes_avx512_nt.asm
+
+lsrc_x86_64 += aes/gcm_multibinary_nt.asm
+
+lsrc_x86_64 += aes/keyexp_multibinary.asm
+lsrc_x86_64 += aes/keyexp_128.asm aes/keyexp_192.asm aes/keyexp_256.asm
+lsrc_x86_64 += aes/cbc_multibinary.asm
+lsrc_x86_64 += aes/cbc_dec_128_x4_sse.asm aes/cbc_dec_128_x8_avx.asm
+lsrc_x86_64 += aes/cbc_dec_192_x4_sse.asm aes/cbc_dec_192_x8_avx.asm
+lsrc_x86_64 += aes/cbc_dec_256_x4_sse.asm aes/cbc_dec_256_x8_avx.asm
+lsrc_x86_64 += aes/cbc_enc_128_x4_sb.asm aes/cbc_enc_128_x8_sb.asm
+lsrc_x86_64 += aes/cbc_enc_192_x4_sb.asm aes/cbc_enc_192_x8_sb.asm
+lsrc_x86_64 += aes/cbc_enc_256_x4_sb.asm aes/cbc_enc_256_x8_sb.asm
+lsrc_x86_64 += aes/cbc_dec_vaes_avx512.asm
+lsrc_x86_64 += aes/cbc_pre.c
+lsrc_x86_64 += aes/xts_aes_128_multibinary.asm
+lsrc_x86_64 += aes/XTS_AES_128_dec_sse.asm aes/XTS_AES_128_dec_expanded_key_sse.asm
+lsrc_x86_64 += aes/XTS_AES_128_enc_sse.asm aes/XTS_AES_128_enc_expanded_key_sse.asm
+lsrc_x86_64 += aes/XTS_AES_128_dec_avx.asm aes/XTS_AES_128_dec_expanded_key_avx.asm
+lsrc_x86_64 += aes/XTS_AES_128_enc_avx.asm aes/XTS_AES_128_enc_expanded_key_avx.asm
+lsrc_x86_64 += aes/xts_aes_256_multibinary.asm
+lsrc_x86_64 += aes/XTS_AES_256_dec_avx.asm aes/XTS_AES_256_dec_expanded_key_avx.asm
+lsrc_x86_64 += aes/XTS_AES_256_enc_avx.asm aes/XTS_AES_256_enc_expanded_key_avx.asm
+lsrc_x86_64 += aes/XTS_AES_256_dec_sse.asm aes/XTS_AES_256_dec_expanded_key_sse.asm
+lsrc_x86_64 += aes/XTS_AES_256_enc_sse.asm aes/XTS_AES_256_enc_expanded_key_sse.asm
+lsrc_x86_64 += aes/XTS_AES_256_enc_vaes.asm
+lsrc_x86_64 += aes/XTS_AES_128_enc_vaes.asm
+lsrc_x86_64 += aes/XTS_AES_256_enc_expanded_key_vaes.asm
+lsrc_x86_64 += aes/XTS_AES_128_enc_expanded_key_vaes.asm
+lsrc_x86_64 += aes/XTS_AES_256_dec_vaes.asm
+lsrc_x86_64 += aes/XTS_AES_128_dec_vaes.asm
+lsrc_x86_64 += aes/XTS_AES_256_dec_expanded_key_vaes.asm
+lsrc_x86_64 += aes/XTS_AES_128_dec_expanded_key_vaes.asm
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += aes/gcm_pre.c \
+ aes/aarch64/gcm_multibinary_aarch64.S \
+ aes/aarch64/keyexp_multibinary_aarch64.S \
+ aes/aarch64/gcm_aarch64_dispatcher.c \
+ aes/aarch64/keyexp_aarch64_dispatcher.c \
+ aes/aarch64/keyexp_128_aarch64_aes.S \
+ aes/aarch64/keyexp_192_aarch64_aes.S \
+ aes/aarch64/keyexp_256_aarch64_aes.S \
+ aes/aarch64/aes_gcm_aes_finalize_128.S \
+ aes/aarch64/aes_gcm_aes_init.S \
+ aes/aarch64/aes_gcm_enc_dec_128.S \
+ aes/aarch64/aes_gcm_precomp_128.S \
+ aes/aarch64/aes_gcm_update_128.S \
+ aes/aarch64/aes_gcm_aes_finalize_256.S \
+ aes/aarch64/aes_gcm_consts.S \
+ aes/aarch64/aes_gcm_enc_dec_256.S \
+ aes/aarch64/aes_gcm_precomp_256.S \
+ aes/aarch64/aes_gcm_update_256.S \
+ aes/aarch64/xts_aarch64_dispatcher.c \
+ aes/aarch64/xts_aes_128_dec.S \
+ aes/aarch64/xts_aes_128_enc.S \
+ aes/aarch64/xts_keyexp_aes_128_dec.S \
+ aes/aarch64/xts_keyexp_aes_128_enc.S \
+ aes/aarch64/xts_aes_256_dec.S \
+ aes/aarch64/xts_aes_256_enc.S \
+ aes/aarch64/xts_keyexp_aes_256_dec.S \
+ aes/aarch64/xts_keyexp_aes_256_enc.S \
+ aes/aarch64/xts_multibinary_aarch64.S \
+ aes/cbc_pre.c \
+ aes/aarch64/cbc_multibinary_aarch64.S \
+ aes/aarch64/cbc_aarch64_dispatcher.c \
+ aes/aarch64/cbc_enc_aes.S \
+ aes/aarch64/cbc_dec_aes.S
+
+other_src += include/multibinary.asm
+other_src += include/test.h include/types.h include/reg_sizes.asm
+other_src += aes/gcm_defines.asm
+other_src += aes/aes_common.asm
+other_src += aes/clear_regs.asm
+other_src += aes/cbc_common.asm aes/cbc_std_vectors.h
+other_src += aes/gcm_vectors.h aes/ossl_helper.h
+other_src += aes/xts_128_vect.h
+other_src += aes/xts_256_vect.h
+other_src += aes/gcm_sse.asm
+other_src += aes/gcm_avx_gen2.asm
+other_src += aes/gcm_avx_gen4.asm
+other_src += aes/gcm_keys_vaes_avx512.asm
+other_src += aes/gcm_vaes_avx512.asm
+
+check_tests += aes/cbc_std_vectors_test
+check_tests += aes/gcm_std_vectors_test
+check_tests += aes/gcm_nt_std_vectors_test
+check_tests += aes/xts_128_test
+check_tests += aes/xts_256_test
+check_tests += aes/xts_128_expanded_key_test
+check_tests += aes/xts_256_expanded_key_test
+
+unit_tests += aes/cbc_std_vectors_random_test
+unit_tests += aes/gcm_std_vectors_random_test
+unit_tests += aes/gcm_nt_rand_test
+unit_tests += aes/xts_128_rand aes/xts_128_rand_ossl_test
+unit_tests += aes/xts_256_rand aes/xts_256_rand_ossl_test
+
+perf_tests += aes/cbc_ossl_perf
+perf_tests += aes/gcm_ossl_perf
+perf_tests += aes/xts_128_enc_ossl_perf
+perf_tests += aes/xts_256_enc_ossl_perf
+perf_tests += aes/xts_128_enc_perf aes/xts_128_dec_perf aes/xts_128_dec_ossl_perf
+perf_tests += aes/xts_256_enc_perf aes/xts_256_dec_perf aes/xts_256_dec_ossl_perf
+
+examples += aes/gcm_simple_example
+
+cbc_ossl_perf: LDLIBS += -lcrypto
+aes_cbc_ossl_perf_LDFLAGS = -lcrypto
+cbc_std_vectors_random_test: LDLIBS += -lcrypto
+aes_cbc_std_vectors_random_test_LDFLAGS = -lcrypto
+gcm_ossl_perf: LDLIBS += -lcrypto
+aes_gcm_ossl_perf_LDFLAGS = -lcrypto
+gcm_std_vectors_random_test: LDLIBS += -lcrypto
+aes_gcm_std_vectors_random_test_LDFLAGS = -lcrypto
+gcm_nt_rand_test: LDLIBS += -lcrypto
+aes_gcm_nt_rand_test_LDFLAGS = -lcrypto
+xts_128_enc_ossl_perf: LDLIBS += -lcrypto
+aes_xts_128_enc_ossl_perf_LDFLAGS = -lcrypto
+xts_128_dec_ossl_perf: LDLIBS += -lcrypto
+aes_xts_128_dec_ossl_perf_LDFLAGS = -lcrypto
+xts_128_rand_ossl_test: LDLIBS += -lcrypto
+aes_xts_128_rand_ossl_test_LDFLAGS = -lcrypto
+xts_256_enc_ossl_perf : LDLIBS += -lcrypto
+aes_xts_256_enc_ossl_perf_LDFLAGS = -lcrypto
+xts_256_dec_ossl_perf : LDLIBS += -lcrypto
+aes_xts_256_dec_ossl_perf_LDFLAGS = -lcrypto
+xts_256_rand_ossl_test: LDLIBS += -lcrypto
+aes_xts_256_rand_ossl_test_LDFLAGS = -lcrypto
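The XTS sources and tests listed above are reached through the multibinary entry
points declared in include/aes_xts.h; their argument order mirrors the prototype
documented in the XTS_AES_128_dec_avx.asm header that follows (k2 tweak key, k1
data key, initial tweak, byte count, input, output). A minimal round-trip sketch,
assuming those dispatcher prototypes and using placeholder keys and a 512-byte
sector:

    #include <stdint.h>
    #include <string.h>
    #include "aes_xts.h"

    int main(void)
    {
        uint8_t key1[16] = {0};   /* k1: data encryption key */
        uint8_t key2[16] = {0};   /* k2: tweak key */
        uint8_t tweak[16] = {0};  /* initial tweak value, e.g. the sector number */
        uint8_t pt[512], ct[512], out[512];

        memset(pt, 0xab, sizeof(pt));

        /* Keys are expanded internally; the *_expanded_key variants take
         * pre-expanded keys instead. */
        XTS_AES_128_enc(key2, key1, tweak, sizeof(pt), pt, ct);
        XTS_AES_128_dec(key2, key1, tweak, sizeof(ct), ct, out);

        return memcmp(pt, out, sizeof(pt)) != 0;
    }
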
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm
new file mode 100644
index 000000000..85582c0df
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm
@@ -0,0 +1,1778 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_avx(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
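
The repeated shl/adc/cmovc/xor sequences on twtempl:twtemph in the macros below
compute the next tweak by multiplying the current 128-bit tweak by x in GF(2^128)
and reducing with GHASH_POLY (0x87). The same single step in C, for reference
(function and variable names are illustrative only):

    #include <stdint.h>

    /* tw <- tw * x mod (x^128 + x^7 + x^2 + x + 1);
     * tw[0] is the low 64 bits (twtempl), tw[1] the high 64 bits (twtemph). */
    static void xts_mul_x(uint64_t tw[2])
    {
        uint64_t carry = tw[1] >> 63;              /* bit shifted out of the top half */
        tw[1] = (tw[1] << 1) | (tw[0] >> 63);      /* adc twtemph, twtemph */
        tw[0] = (tw[0] << 1) ^ (carry ? 0x87 : 0); /* shl twtempl, 1; conditional xor with 0x87 */
    }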
+
+; produce the key for the next round
+; raw_key is the output of vaeskeygenassist instruction
+; round_key value before this key_expansion_128 macro is current round key
+; round_key value after this key_expansion_128 macro is next round key
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 9
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%xtmp2 %6
+%define %%ptr_key2 %7
+%define %%ptr_key1 %8
+%define %%ptr_expanded_keys %9
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*9], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*8], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*7], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*6], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*5], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*4], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*3], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*2], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*1], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*0], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values are generated
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; decrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro decrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
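+
+; Note on the macro above: the scalar tweak-update arithmetic (shl/adc/cmovc/xor on
+; twtempl:twtemph) that computes the next eight tweak values is interleaved between the
+; vaesdec rounds, so the general-purpose-register work can overlap the latency of the
+; AES rounds.  When %%last_eight is non-zero this is the final group of blocks and the
+; tweak update is skipped.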
+
+
+section .text
+
+mk_global XTS_AES_128_dec_avx, function
+XTS_AES_128_dec_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7]
+ vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
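+
+	; For XTS decryption with ciphertext stealing the tweak order is reversed: the last
+	; full ciphertext block (block 8 below) is decrypted with the tweak value just
+	; generated and stored at [TW + 16*7], while the tweak it displaced, saved at
+	; [TW + 16*0], is used later in _steal_cipher for the final partial block.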
+
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+	vpxor	xmm8, xmm3	; state register is xmm8: instead of moving xmm3 to xmm8, the destination register of the vpxor is swapped
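+
+	; xmm8 now holds the final block to decrypt: the N_val bytes of the trailing partial
+	; ciphertext combined with the stolen upper (16 - N_val) bytes of the block decrypted
+	; above, already xored with the tweak saved at [TW + 16*0].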
+
+
+ ;decrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
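+
+; Usage note: _steal_cipher loads 16 bytes at [vpshufb_shf_table + N_val] to obtain a
+; vpshufb mask that shifts a register left by (16 - N_val) bytes (mask bytes with the
+; top bit set produce zero), and loads at [vpshufb_shf_table + 16 - N_val], xored with
+; mask1, to obtain the complementary right shift.  The commented rows above list the
+; masks that result for each possible value of N_val.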
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm
new file mode 100644
index 000000000..faa7e895e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm
@@ -0,0 +1,1748 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; second key is stored on the stack, 16-byte aligned
+; first key is required only once, so there is no need to store it
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
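+
+; Stack layout after the prologue's "sub rsp, VARIABLE_OFFSET": 8 tweak values at [TW],
+; 11 round keys at [keys], then (win64 only) the xmm6-xmm15 save area at [_xmm], and the
+; saved GPRs at [_gpr].  Keeping VARIABLE_OFFSET an odd multiple of 8 leaves rsp 16-byte
+; aligned inside the function (the call pushed 8 bytes), so the vmovdqa accesses to these
+; areas are valid.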
+
+%define GHASH_POLY 0x87
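+
+; The per-block tweak update multiplies the 128-bit tweak by x in GF(2^128) using the
+; XTS reduction polynomial x^128 + x^7 + x^2 + x + 1 (low byte 0x87).  On the 64-bit
+; halves kept in twtempl:twtemph this amounts to:
+;   carry   = msb(twtemph)
+;   twtemph = (twtemph << 1) | msb(twtempl)
+;   twtempl = (twtempl << 1) ^ (carry ? GHASH_POLY : 0)
+; and is implemented below with shl/adc plus a cmovc that selects 0 or ghash_poly_8b.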
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
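+
+; Illustrative usage sketch (C, comments only, not part of the build).  It assumes the
+; aes_keyexp_128() helper from this library's aes_keyexp module, which expands a raw
+; 16-byte key into encryption and decryption round-key schedules; treat that helper's
+; name and signature as an assumption here.  The code below requires k2 expanded for
+; encryption (the tweak is always encrypted) and k1 expanded for decryption.
+;
+;   uint8_t k2_enc[16*11], k2_dec[16*11], k1_enc[16*11], k1_dec[16*11];
+;   aes_keyexp_128(raw_tweak_key, k2_enc, k2_dec);
+;   aes_keyexp_128(raw_data_key,  k1_enc, k1_dec);
+;   XTS_AES_128_dec_expanded_key_avx(k2_enc, k1_dec, tweak, sector_len, ct, pt);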
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
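+
+; Note: despite the names, in this decrypt routine ptr_plaintext is the ciphertext
+; input (ct) and ptr_ciphertext is the plaintext output (pt).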
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*0]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
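+
+; The macro above does two jobs in one pass: it runs the ten vaesenc rounds that encrypt
+; the initial tweak with the key2 schedule, and, interleaved with those rounds, it copies
+; the 11 key1 round keys into the aligned stack area at [keys] so that the decryption
+; rounds below can load them with vmovdqa.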
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; decrypt initial blocks of AES (the macro keeps the encrypt_* name; the rounds below use vaesdec)
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Decrypt 8 blocks in parallel (the macro keeps the encrypt_by_eight name; the rounds below use vaesdec)
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_128_dec_expanded_key_avx, function
+XTS_AES_128_dec_expanded_key_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7]
+ vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
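+;
+; Illustration only (nothing below is used by the assembly): the table above
+; holds vpshufb control bytes. vpshufb writes src[mask & 0x0f] into each byte
+; lane and zeroes any lane whose mask byte has bit 7 set, so the 16 bytes
+; starting at offset N shift a block left by 16-N bytes, while the entry at
+; offset 16-N (after the XOR with mask1) shifts the stolen bytes and doubles
+; as the vpblendvb selector. A minimal C emulation of the left shift:
+;
+;     #include <stdint.h>
+;     #include <string.h>
+;
+;     static const uint8_t shf_table[32] = {
+;         0x00,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
+;         0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
+;         0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
+;         0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x00,
+;     };
+;
+;     static void pshufb_emulate(uint8_t dst[16], const uint8_t src[16],
+;                                const uint8_t mask[16])
+;     {
+;         for (int i = 0; i < 16; i++)
+;             dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
+;     }
+;
+;     /* shift blk left by (16 - n) bytes, n in 1..15, as done by
+;        "vmovdqu xmm0, [vpshufb_shf_table + N_val]" / "vpshufb xmm8, xmm0" */
+;     static void shift_left_16_minus_n(uint8_t blk[16], unsigned n)
+;     {
+;         uint8_t out[16];
+;         pshufb_emulate(out, blk, &shf_table[n]);
+;         memcpy(blk, out, 16);
+;     }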
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm
new file mode 100644
index 000000000..0b1b637be
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm
@@ -0,0 +1,1747 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
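+
+; As an illustration only (the assembly never declares such a type), the elf64
+; scratch area laid out by the defines above can be pictured as the following
+; hypothetical C struct; its size is 16*8 + 16*11 + 8*1 = 312 bytes, the odd
+; multiple of 8 required of VARIABLE_OFFSET (the win64 frame additionally
+; saves xmm6:xmm15 and rdi/rsi, for a total of 488 bytes):
+;
+;     #include <stdint.h>
+;
+;     struct xts_dec_stack_frame_elf64 {      /* hypothetical name */
+;         uint8_t  tweaks[8][16];             /* TW:   8 tweak values   */
+;         uint8_t  round_keys[11][16];        /* keys: 11 expanded keys */
+;         uint64_t saved_rbx;                 /* _gpr: callee-saved rbx */
+;     };
+;
+;     _Static_assert(sizeof(struct xts_dec_stack_frame_elf64) == 312,
+;                    "matches the elf64 VARIABLE_OFFSET above");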
+
+%define GHASH_POLY 0x87
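+
+; GHASH_POLY is the low byte of the XTS reduction polynomial
+; x^128 + x^7 + x^2 + x + 1. Every "shl twtempl / adc twtemph / cmovc / xor"
+; sequence below multiplies the 128-bit tweak by x in GF(2^128), with the
+; tweak held as two little-endian 64-bit halves. A minimal C sketch of that
+; update (tw[0] plays twtempl, tw[1] plays twtemph):
+;
+;     #include <stdint.h>
+;
+;     static void xts_mul_x(uint64_t tw[2])
+;     {
+;         uint64_t carry = tw[1] >> 63;          /* bit shifted out of the top  */
+;         tw[1] = (tw[1] << 1) | (tw[0] >> 63);  /* adc twtemph, twtemph        */
+;         tw[0] <<= 1;                           /* shl twtempl, 1              */
+;         if (carry)
+;             tw[0] ^= 0x87;                     /* cmovc + xor with GHASH_POLY */
+;     }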
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
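+;
+; A hedged usage sketch (not part of this file): one plausible way to drive
+; the routine from C. Only the prototype above comes from this file; the
+; aes_keyexp_128() declaration is an assumption about the companion
+; isa-l_crypto key-expansion helper, and the wrapper name is hypothetical.
+;
+;     #include <stdint.h>
+;
+;     void XTS_AES_128_dec_expanded_key_sse(uint8_t *k2, uint8_t *k1,
+;                                           uint8_t *TW_initial, uint64_t N,
+;                                           const uint8_t *ct, uint8_t *pt);
+;     /* assumed helper: expands a raw 16-byte key into enc and dec schedules */
+;     void aes_keyexp_128(uint8_t *key, uint8_t *exp_key_enc, uint8_t *exp_key_dec);
+;
+;     static void xts_128_decrypt_sector(uint8_t tweak_key[16], uint8_t data_key[16],
+;                                        uint8_t iv[16], const uint8_t *ct,
+;                                        uint8_t *pt, uint64_t len)
+;     {
+;         uint8_t k2_enc[16 * 11], k2_dec[16 * 11]; /* tweak key: enc schedule is used */
+;         uint8_t k1_enc[16 * 11], k1_dec[16 * 11]; /* data key: dec schedule is used  */
+;
+;         aes_keyexp_128(tweak_key, k2_enc, k2_dec);
+;         aes_keyexp_128(data_key, k1_enc, k1_dec);
+;         XTS_AES_128_dec_expanded_key_sse(k2_enc, k1_dec, iv, len, ct, pt);
+;     }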
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*0]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
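+
+; A purely sequential, hedged reference model of what one pass of the macro
+; above computes per 16-byte block (decrypt-side XTS: PT = AES-128-DEC(CT xor T)
+; xor T, then T := T*x). OpenSSL's low-level AES block API is used here only
+; for illustration and is not used anywhere by this file; whole blocks only,
+; the partial tail is handled by the _steal_cipher path.
+;
+;     #include <stddef.h>
+;     #include <stdint.h>
+;     #include <openssl/aes.h>
+;
+;     static void mul_x(uint8_t t[16])            /* byte-wise form of the shl/adc chain */
+;     {
+;         uint8_t carry = t[15] >> 7;
+;         for (int i = 15; i > 0; i--)
+;             t[i] = (uint8_t)((t[i] << 1) | (t[i - 1] >> 7));
+;         t[0] = (uint8_t)(t[0] << 1);
+;         if (carry)
+;             t[0] ^= 0x87;
+;     }
+;
+;     void xts_128_dec_ref(const uint8_t k1[16], const uint8_t k2[16],
+;                          const uint8_t iv[16], const uint8_t *ct,
+;                          uint8_t *pt, size_t nblocks)
+;     {
+;         AES_KEY ek2, dk1;
+;         uint8_t t[16], buf[16];
+;
+;         AES_set_encrypt_key(k2, 128, &ek2);
+;         AES_set_decrypt_key(k1, 128, &dk1);
+;         AES_encrypt(iv, t, &ek2);               /* like encrypt_T: T0 = E_k2(IV) */
+;
+;         for (size_t i = 0; i < nblocks; i++) {
+;             for (int j = 0; j < 16; j++)
+;                 buf[j] = ct[16 * i + j] ^ t[j]; /* pxor %%STx, %%TWx             */
+;             AES_decrypt(buf, buf, &dk1);        /* the aesdec/aesdeclast rounds  */
+;             for (int j = 0; j < 16; j++)
+;                 pt[16 * i + j] = buf[j] ^ t[j]; /* xor the tweak back in         */
+;             mul_x(t);                           /* next tweak                    */
+;         }
+;     }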
+
+
+section .text
+
+mk_global XTS_AES_128_dec_expanded_key_sse, function
+XTS_AES_128_dec_expanded_key_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
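+
+	; Worked example (illustration): for a 512-byte sector N_val = 512, so
+	; target_ptr_val = 512 - (512 mod 16) - 128 = 384 (plus ptr_ciphertext);
+	; the dispatch below selects (512/16) mod 8 = 0 initial blocks, the main
+	; loop then decrypts bytes 0..383 in 128-byte strides, and the final 128
+	; bytes take the _last_eight/_done_final path with no stitched tweak
+	; generation.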
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
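+
+	; At this point TW[0] holds the previous eighth tweak and TW[7] holds the
+	; newly generated one: the last full ciphertext block below is decrypted
+	; with the *next* tweak, while the reassembled stolen block in
+	; _steal_cipher is decrypted with the previous tweak taken from [TW],
+	; which is the ordering XTS ciphertext stealing requires on the decrypt
+	; side.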
+
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+ aesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm15
+ movdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm14
+ movdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm13
+ movdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm12
+ movdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm11
+ movdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm10
+ movdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm9
+ movdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_vaes.asm
new file mode 100644
index 000000000..7f243949a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_vaes.asm
@@ -0,0 +1,1648 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; expanded keys are not aligned
+; the pre-expanded round keys of key1 are copied to the stack in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys	rsp + 16*8	; space for 15 round keys (AES-128 uses 11)
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
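+
+; 0x87 is the low byte of the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1.
+; Each tweak is derived from the previous one by a GF(2^128) doubling:
+;     T' = (T << 1) ^ (0x87 if bit 127 of T was set, else 0)
+; The scalar paths below implement this with shl/adc/cmovc/xor on the two
+; 64-bit halves; the zmm paths implement it with vpclmulqdq against zpoly.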
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_expanded_key_vaes(
+;               UINT8 *k2,              // key used for tweaking, 16*11 bytes
+;               UINT8 *k1,              // key used for "ECB" decryption, 16*11 bytes
+;               UINT8 *TW_initial,      // initial tweak value, 16 bytes
+;               UINT64 N,               // sector size, in bytes
+;               const UINT8 *ct,        // ciphertext sector input data
+;               UINT8 *pt);             // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*0]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
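+
+; Note: in this expanded-key variant encrypt_T performs no key expansion; it
+; encrypts the initial tweak with the pre-expanded key2 schedule while copying
+; the 11 pre-expanded round keys of key1 into the aligned stack area at [keys],
+; so the data path can use aligned vmovdqa loads.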
+
+
+; Original (scalar) way to generate the initial tweak values and load the input blocks
+; only used for small numbers of blocks
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
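+
+; The chain above derives tweak i+1 from tweak i with the scalar
+; shl/adc/cmovc/xor sequence, i.e. a 128-bit left shift with conditional
+; reduction by GHASH_POLY, writing each value to [TW] before loading it into
+; its xmm register.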
+
+
+; Original (scalar) decrypt of the initial AES blocks
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; the next 8 tweak values can be generated
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
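+
+; When %%lt128 is 0, the macro above also computes the next 8 tweak values in
+; the scalar twtempl/twtemph registers and stores them to [TW] between AES
+; rounds, so the GF(2^128) updates overlap with the vaesdec latency.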
+
+
+
+; Decrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro decrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
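+
+; Tweak update used above: each 128-bit lane of %%TW1/%%TW2 is multiplied by
+; 2^8. vpsrldq isolates the top byte of each lane, vpclmulqdq against zpoly
+; produces the reduction term for the bits shifted out, and vpslldq/vpxord
+; combine them into the next 8 tweak values (zmm15/zmm16).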
+
+
+; Decrypt 16 blocks in parallel
+; generate next 8 tweak values
+%macro decrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
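+
+; Same scheme as decrypt_by_eight_zmm, but with four zmm state registers
+; (16 blocks per iteration). The next 16 tweaks are built from %%TW3/%%TW4:
+; zmm15/zmm16 hold them multiplied by 2^8 and zmm17/zmm18 by 2^16.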
+
+
+section .text
+
+mk_global XTS_AES_128_dec_expanded_key_vaes, function
+XTS_AES_128_dec_expanded_key_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
+
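+; _do_n_blocks handles the 1..7 block tail left over after the wide loops.
+; If N is not a multiple of 16, each case falls through to _steal_cipher to
+; finish the partial block with ciphertext stealing.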
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+	vmovdqu	xmm1, [ptr_plaintext - 16]	; Re-do the last full block with the next tweak
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext - 16], xmm1
+ vmovdqa xmm8, xmm1
+
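+	; For ciphertext stealing with no full block left in the tail, the last
+	; full ciphertext block is decrypted again with the following tweak
+	; (xmm9); the preceding tweak is reconstructed below by undoing one
+	; GF(2^128) doubling of xmm9 (shift right by one bit and fold the
+	; polynomial back in when the shifted-out bit was set).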
+ ; Calc previous tweak
+ mov tmp1, 1
+ kmovq k1, tmp1
+ vpsllq xmm13, xmm9, 63
+ vpsraq xmm14, xmm13, 63
+ vpandq xmm5, xmm14, XWORD(zpoly)
+ vpxorq xmm9 {k1}, xmm9, xmm5
+ vpsrldq xmm10, xmm9, 8
+ vpshrdq xmm0, xmm9, xmm10, 1
+ vpslldq xmm13, xmm13, 8
+ vpxorq xmm0, xmm0, xmm13
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7_remain
+ vextracti32x4 xmm12, zmm10, 2
+ vextracti32x4 xmm13, zmm10, 3
+ vinserti32x4 zmm10, xmm13, 2
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+ vextracti32x4 xmm8, zmm2, 0x2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_7_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6_remain
+ vextracti32x4 xmm12, zmm10, 1
+ vextracti32x4 xmm13, zmm10, 2
+ vinserti32x4 zmm10, xmm13, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+ vextracti32x4 xmm8, zmm2, 0x1
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_6_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ jmp _ret_
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5_remain
+ vmovdqa xmm12, xmm10
+ vextracti32x4 xmm10, zmm10, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_5_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4_remain
+ vextracti32x4 xmm12, zmm9, 3
+ vinserti32x4 zmm9, xmm10, 3
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+ vextracti32x4 xmm8, zmm1, 0x3
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_4_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ jmp _ret_
+
+_remaining_num_blocks_is_3:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3_remain
+ vextracti32x4 xmm13, zmm9, 2
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 3
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ vmovdqa xmm0, xmm13
+ jmp _steal_cipher
+_done_3_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ jmp _ret_
+
+_remaining_num_blocks_is_2:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2_remain
+ vextracti32x4 xmm10, zmm9, 2
+ vextracti32x4 xmm12, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_2_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ and N_val, 15
+ je _done_1_remain
+ vextracti32x4 xmm11, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+ vmovdqa xmm8, xmm1
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+_done_1_remain:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ jmp _ret_
+
+
+
+_start_by16:
+	; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+	; Make next 8 tweak values by multiplying each by 2^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
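+
+	; At this point zmm9 holds the initial tweak multiplied by 2^0..2^3 (one
+	; power per 128-bit lane), zmm10 by 2^4..2^7, and zmm11/zmm12 by
+	; 2^8..2^15. shufb_15_7 gathers the top byte of each qword so the
+	; variable shifts plus vpclmulqdq can fold the shifted-out bits back in
+	; through the polynomial.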
+
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
+_start_by8:
+	; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
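+; Ciphertext stealing (decrypt): xmm8 holds the block obtained by decrypting
+; the last full ciphertext block with the later tweak; its first N_val bytes
+; are emitted as the final partial output. The remaining bytes are merged with
+; the partial ciphertext block via vpshufb_shf_table/mask1, and the merged
+; block is decrypted with the earlier tweak in xmm0 to give the last full
+; output block (stored at _done).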
+_steal_cipher:
+	; start ciphertext stealing: xmm8 = block decrypted with the later tweak, xmm0 = tweak for the merged final block
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;decrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
+
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa64 xmm16, xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa64 xmm0, xmm16
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm15, xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm0, xmm15
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm14, xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm0, xmm14
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm13, xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm0, xmm13
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm12, xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm0, xmm12
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm11, xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm0, xmm11
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ and N_val, 15
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm0, xmm10
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
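+
+; const_dq3210/const_dq7654 give the per-qword left-shift counts used by
+; vpsllvq to compute T*2^j (j = 0..7, two qwords per 128-bit lane), while
+; const_dq5678/const_dq1234 give the matching right-shift counts used to
+; extract the bits shifted out; shufb_15_7 positions the top byte of each
+; qword so those bits can be reduced with vpclmulqdq against zpoly.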
+
+%else  ; Assembler does not understand these opcodes; add an empty symbol for Windows builds.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_128_dec_expanded_key_vaes
+no_XTS_AES_128_dec_expanded_key_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm
new file mode 100644
index 000000000..19f887c2f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm
@@ -0,0 +1,1779 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_sse(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+; produce the key for the next round
+; raw_key is the output of aeskeygenassist instruction
+; round_key value before this key_expansion_128 macro is current round key
+; round_key value after this key_expansion_128 macro is next round key
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
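+
+; Standard AES-128 key schedule step: aeskeygenassist supplies
+; SubWord(RotWord(w3)) xor Rcon in its top dword, pshufd broadcasts it, and
+; the two shufps/pxor pairs accumulate the running xor of the previous round
+; key's words before the final pxor produces the next round key.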
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 9
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%xtmp2 %6
+%define %%ptr_key2 %7
+%define %%ptr_key1 %8
+%define %%ptr_expanded_keys %9
+
+
+ movdqu %%xkey2, [%%ptr_key2]
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*9], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*8], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*7], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*6], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*5], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*4], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*3], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*2], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*1], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*0], %%xkey1
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
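+
+; Because the data path uses aesdec (the equivalent inverse cipher), every
+; key1 round key except the first and last is passed through aesimc before
+; being stored, and the schedule is written to [keys] in reverse order:
+; [keys] holds the last round key and [keys+16*10] the original key.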
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values are generated
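+; the scalar tweak updates (shl/adc/cmovc/xor on twtempl:twtemph) are stitched
+; in between the AES rounds so they overlap with aesdec latency; when %%lt128
+; is non-zero no further tweaks are needed and these updates are skipped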
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; decrypt 8 blocks in parallel
+; generate next 8 tweak values
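+; the next 8 tweaks are computed on the scalar side in between the AES rounds
+; and written to [TW]; they are reloaded into %%TW1-%%TW7 at the end of the
+; macro, while the 8th tweak is passed by the caller as the memory operand
+; [TW + 16*7]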
+%macro decrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_128_dec_sse, function
+XTS_AES_128_dec_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
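+	; xmm8 holds the block just decrypted with the final tweak; its first
+	; N_val bytes are written out as the short trailing block, the remaining
+	; ciphertext tail is merged in via pshufb/pblendvb, and the combined
+	; block is decrypted with the previous tweak saved in [TW]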
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+	pxor	xmm8, xmm3	; state register is xmm8: swapping the pxor destination avoids a separate move from xmm3 to xmm8
+
+
+ ;decrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+ aesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm15
+ movdqa xmm15, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm14
+ movdqa xmm14, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm13
+ movdqa xmm13, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm12
+ movdqa xmm12, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm11
+ movdqa xmm11, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm10
+ movdqa xmm10, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm9
+ movdqa xmm9, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm
new file mode 100644
index 000000000..e3435dd83
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_vaes.asm
@@ -0,0 +1,1681 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
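+; in the wide code paths below (decrypt_by_eight_zmm / decrypt_by_16_zmm) the
+; tweaks are advanced with vpclmulqdq against the reduction constant in zpoly
+; instead of the scalar shl/adc sequence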
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys	rsp + 16*8	; space reserved for 15 expanded keys (AES-128 uses 11)
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_vaes(
+;               UINT8 *k2,      // key used for tweaking, 16*1 bytes
+;               UINT8 *k1,      // key used for "ECB" decryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+;               const UINT8 *ct,        // ciphertext sector input data
+;               UINT8 *pt);     // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+; produce the key for the next round
+; raw_key is the output of vaeskeygenassist instruction
+; round_key value before this key_expansion_128 macro is current round key
+; round_key value after this key_expansion_128 macro is next round key
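+; vaeskeygenassist delivers RotWord(SubWord(..)) xor Rcon of the last word,
+; broadcast across the register by vpshufd; the vshufps/vpxor pairs propagate
+; the previous round-key words (equivalent to the usual pslldq/pxor chain)
+; before the final vpxor adds in the assisted word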
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
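+; key2's schedule is consumed on the fly to AES-encrypt the initial tweak
+; (result stored at [TW]); key1's round keys are run through vaesimc and
+; stored at %%ptr_expanded_keys in descending order (the original and final
+; round keys are stored without vaesimc), ready for the equivalent
+; inverse cipher vaesdec flow that walks [keys] from index 0 upward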
+
+%macro encrypt_T 9
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%xtmp2 %6
+%define %%ptr_key2 %7
+%define %%ptr_key1 %8
+%define %%ptr_expanded_keys %9
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*9], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*8], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*7], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*6], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*5], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*4], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*3], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*2], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*1], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*0], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; Original way to generate initial tweak values and load plaintext values
+; only used for small blocks
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
+
+
+; Original decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values can be generated
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+
+; Decrypt 8 blocks in parallel
+; generate next 8 tweak values
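+; each tweak register holds 4 consecutive tweaks; when %%last_eight is 0 the
+; next 8 tweaks are obtained by multiplying the current ones by x^8 in
+; GF(2^128): vpslldq shifts every 128-bit lane left by one byte and the byte
+; that falls off is folded back in via vpclmulqdq with the constant in zpoly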
+%macro decrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
+
+
+; Decrypt 16 blocks in parallel
+; generate next 16 tweak values
+%macro decrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_128_dec_vaes, function
+XTS_AES_128_dec_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
+
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+	vmovdqu	xmm1, [ptr_plaintext - 16]	; re-do the last block with the next tweak
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext - 16], xmm1
+ vmovdqa xmm8, xmm1
+
+ ; Calc previous tweak
+ mov tmp1, 1
+ kmovq k1, tmp1
+ vpsllq xmm13, xmm9, 63
+ vpsraq xmm14, xmm13, 63
+ vpandq xmm5, xmm14, XWORD(zpoly)
+ vpxorq xmm9 {k1}, xmm9, xmm5
+ vpsrldq xmm10, xmm9, 8
+ vpshrdq xmm0, xmm9, xmm10, 1
+ vpslldq xmm13, xmm13, 8
+ vpxorq xmm0, xmm0, xmm13
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7_remain
+ vextracti32x4 xmm12, zmm10, 2
+ vextracti32x4 xmm13, zmm10, 3
+ vinserti32x4 zmm10, xmm13, 2
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+ vextracti32x4 xmm8, zmm2, 0x2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_7_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6_remain
+ vextracti32x4 xmm12, zmm10, 1
+ vextracti32x4 xmm13, zmm10, 2
+ vinserti32x4 zmm10, xmm13, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+ vextracti32x4 xmm8, zmm2, 0x1
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_6_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ jmp _ret_
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5_remain
+ vmovdqa xmm12, xmm10
+ vextracti32x4 xmm10, zmm10, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_5_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4_remain
+ vextracti32x4 xmm12, zmm9, 3
+ vinserti32x4 zmm9, xmm10, 3
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+ vextracti32x4 xmm8, zmm1, 0x3
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_4_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ jmp _ret_
+
+_remaining_num_blocks_is_3:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3_remain
+ vextracti32x4 xmm13, zmm9, 2
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 3
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ vmovdqa xmm0, xmm13
+ jmp _steal_cipher
+_done_3_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ jmp _ret_
+
+_remaining_num_blocks_is_2:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2_remain
+ vextracti32x4 xmm10, zmm9, 2
+ vextracti32x4 xmm12, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_2_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ and N_val, 15
+ je _done_1_remain
+ vextracti32x4 xmm11, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+ vmovdqa xmm8, xmm1
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+_done_1_remain:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ jmp _ret_
+
+
+
+_start_by16:
+	; Make first 8 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
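+	; k2 = 0xaa selects the odd (high) qword of every 128-bit lane, so the masked
+	; vpxorq below merges the bits shifted out of each tweak's low qword into its
+	; high qword; bits shifted out of bit 127 are reduced via vpclmulqdq with zpoly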
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+	; Make next 8 tweak values by multiplying all by 2^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
+
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
+_start_by8:
+	; Make first 8 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
+_steal_cipher:
+	; start cipher stealing: xmm8 holds the decrypted last full block, xmm0 the tweak for the final block
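+	; The first N_val bytes of xmm8 become the partial plaintext output; its
+	; remaining 16-N_val bytes are appended to the trailing N_val ciphertext bytes
+	; and the merged block is decrypted once more with tweak xmm0 to produce the
+	; final full plaintext block (stored at ptr_ciphertext - 16 in _done)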
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;decrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
+
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa64 xmm16, xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa64 xmm0, xmm16
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm15, xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm0, xmm15
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm14, xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm0, xmm14
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm13, xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm0, xmm13
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm12, xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm0, xmm12
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm11, xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm0, xmm11
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ and N_val, 15
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm0, xmm10
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values as shift constants for the vpshufb instruction
+; different alignments result in the values shown below:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
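+; Indexing: a 16-byte load at [vpshufb_shf_table + N_val] is a vpshufb mask that
+; shifts a register left (toward higher byte indices) by 16 - N_val bytes, with
+; 0x80-tagged entries producing zero bytes.  In _steal_cipher the load at
+; [vpshufb_shf_table + 16 - N_val] is xor-ed with mask1 to form the matching
+; right-shift mask, which doubles as the vpblendvb selector.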
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+%else  ; assembler does not understand these opcodes; add an empty symbol for Windows
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_128_dec_vaes
+no_XTS_AES_128_dec_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm
new file mode 100644
index 000000000..819617283
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm
@@ -0,0 +1,1531 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, aligned to 16 bytes
+; the first key is required only once, so it does not need to be stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
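+; 0x87 represents x^7 + x^2 + x + 1, the low terms of the GF(2^128) reduction
+; polynomial x^128 + x^7 + x^2 + x + 1 used by XTS: doubling a tweak shifts it
+; left one bit and, when a bit carries out of bit 127, xors 0x87 into the low byte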
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_avx(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
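+; Illustrative call from C for a single sector (sketch only; names follow the
+; prototype above, and 512 is just an example sector size):
+;   XTS_AES_128_enc_avx(key2, key1, tweak, 512, pt, ct);
+; key1 and key2 are raw (unexpanded) 16-byte AES-128 keys; tweak holds the
+; 16-byte initial tweak value for the sector.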
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
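+; twtempl:twtemph hold the 128-bit tweak as low/high 64-bit halves.  The recurring
+; scalar sequence "shl twtempl, 1 / adc twtemph, twtemph / cmovc ..., ghash_poly_8b /
+; xor twtempl, ..." doubles the tweak in GF(2^128): the bit shifted out of the low
+; half carries into the high half, and a carry out of bit 127 is reduced with 0x87.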
+
+
+; produce the key for the next round
+; raw_key is the output of the vaeskeygenassist instruction
+; on entry, round_key holds the current round key
+; on exit, round_key holds the next round key
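+; The two shufps/vpxor pairs compute the running XOR of the four key words
+; (w0, w0^w1, w0^w1^w2, w0^w1^w2^w3); xor-ing in the broadcast raw_key
+; (SubWord(RotWord(w3)) ^ Rcon from vaeskeygenassist) then gives the next round
+; key.  xtmp must enter with its low dword cleared (see "vpxor xmm4, xmm4" at the
+; call site); the macro leaves that dword cleared for the next invocation.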
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+
+
+; macro to encrypt the tweak value while expanding both keys in parallel
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*3], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*4], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*5], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*6], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*7], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*8], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*9], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*10], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt the initial AES blocks
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
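+; when %%lt128 == 0 the next 8 tweak values are computed with scalar code
+; interleaved between the AES rounds and written back to [TW]; when it is 1
+; (fewer than 128 bytes remain) tweak generation is skipped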
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
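+; the scalar generation of the next 8 tweak values is interleaved between the
+; AES rounds (and written to [TW]) to hide its latency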
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_128_enc_avx, function
+XTS_AES_128_enc_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks will not be stitched with tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+	vpblendvb	xmm3, xmm3, xmm2, xmm0	; xmm0 is the blend mask (explicit fourth operand in the AVX form)
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+	vpxor	xmm8, xmm3	; state stays in xmm8: instead of moving xmm3 to xmm8, the destination register of the vpxor is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
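For reference, the _steal_cipher path above implements standard XTS ciphertext stealing: the first N_val bytes of the last full ciphertext block become the final partial output block, and the partial plaintext, padded with the remaining stolen ciphertext bytes, is re-encrypted under the next tweak and written back as the new last full block. The vpshufb_shf_table and mask1 constants just above supply the variable byte shifts and the blend mask used to assemble that padded block. A minimal C sketch of the data movement (not part of the patch; all names are illustrative and the AES step is abstracted behind a caller-supplied function pointer):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef void (*aes_block_fn)(const uint8_t *round_keys,
                             const uint8_t in[16], uint8_t out[16]);

/* last_full_ct is the ciphertext of the final full block (xmm8 before _done);
 * pt_tail holds the n = 1..15 remaining plaintext bytes. */
static void xts_steal(aes_block_fn aes_enc, const uint8_t *round_keys,
                      const uint8_t next_tweak[16],
                      const uint8_t last_full_ct[16],
                      const uint8_t *pt_tail, size_t n,
                      uint8_t *ct_last_full, uint8_t *ct_tail)
{
    uint8_t buf[16];

    /* steal: the first n ciphertext bytes become the final partial block */
    memcpy(ct_tail, last_full_ct, n);

    /* pad the partial plaintext with the remaining 16-n stolen bytes
     * (this is what the vpshufb/vpblendvb shuffle above assembles) */
    memcpy(buf, pt_tail, n);
    memcpy(buf + n, last_full_ct + n, 16 - n);

    /* re-encrypt under the next tweak and overwrite the last full block */
    for (int i = 0; i < 16; i++) buf[i] ^= next_tweak[i];
    aes_enc(round_keys, buf, buf);
    for (int i = 0; i < 16; i++) buf[i] ^= next_tweak[i];
    memcpy(ct_last_full, buf, 16);
}
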
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm
new file mode 100644
index 000000000..f0f5f02f5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm
@@ -0,0 +1,1506 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; expanded keys are not required to be aligned
+; plaintext and ciphertext are not required to be aligned
+; the second key (k1, used for data encryption) is stored on the stack, aligned to 16 bytes
+; the first key (k2, used for tweak encryption) is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+default rel
+
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
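Throughout this file the tweak is advanced by multiplying it by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1; GHASH_POLY (0x87) is the reduction feedback byte, and every shl/adc/cmovc/xor group operating on twtempl/twtemph below is one such doubling. A minimal C sketch of the operation (not part of the patch; the tweak is treated as two little-endian 64-bit halves, as stored at [TW] and [TW+8]):

#include <stdint.h>

/* tw[0] = low 64 bits, tw[1] = high 64 bits of the 128-bit tweak */
static void xts_mul_x(uint64_t tw[2])
{
    uint64_t carry = tw[1] >> 63;            /* bit shifted out of the top */
    tw[1] = (tw[1] << 1) | (tw[0] >> 63);    /* adc twtemph, twtemph       */
    tw[0] <<= 1;                             /* shl twtempl, 1             */
    if (carry)
        tw[0] ^= 0x87;                       /* cmovc/xor with GHASH_POLY  */
}
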
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
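A hypothetical caller sketch for the prototype above (not part of the patch; the prototype is restated from the comment rather than taken from a header, and both key schedules are assumed to be pre-expanded to 11 round keys of 16 bytes each):

#include <stdint.h>

void XTS_AES_128_enc_expanded_key_avx(uint8_t *k2, uint8_t *k1,
                                      uint8_t *tw_initial, uint64_t n,
                                      const uint8_t *pt, uint8_t *ct);

/* Encrypt one sector; n should be at least 16 bytes (one AES block). */
static void encrypt_sector(uint8_t tweak_keys[11 * 16], uint8_t data_keys[11 * 16],
                           uint8_t iv[16], const uint8_t *sector_in,
                           uint8_t *sector_out, uint64_t n)
{
    XTS_AES_128_enc_expanded_key_avx(tweak_keys, data_keys, iv, n,
                                     sector_in, sector_out);
}
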
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
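The encrypt_T macro above does two things at once: it AES-128-encrypts the initial tweak with the k2 schedule and copies the k1 round keys into the aligned stack area so the block-encryption macros can reload them with vmovdqa. An equivalent C-with-intrinsics sketch (not part of the patch; requires AES-NI, e.g. compile with -maes, and all names are illustrative):

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

static void encrypt_tweak_and_cache_keys(const uint8_t *k2, const uint8_t *k1,
                                         const uint8_t iv[16],
                                         uint8_t cached_keys[11][16],
                                         uint8_t tweak0[16])
{
    /* AES-128-encrypt the initial tweak with the (expanded) tweak key */
    __m128i t = _mm_loadu_si128((const __m128i *)iv);
    t = _mm_xor_si128(t, _mm_loadu_si128((const __m128i *)k2));          /* ARK */
    for (int r = 1; r <= 9; r++)
        t = _mm_aesenc_si128(t, _mm_loadu_si128((const __m128i *)(k2 + 16 * r)));
    t = _mm_aesenclast_si128(t, _mm_loadu_si128((const __m128i *)(k2 + 16 * 10)));
    _mm_storeu_si128((__m128i *)tweak0, t);

    /* cache the data-encryption round keys in an aligned scratch area */
    memcpy(cached_keys, k1, 11 * 16);
}
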
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
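encrypt_initial and encrypt_by_eight interleave ("stitch") the next tweak computations with the AES rounds so the integer and vector work overlap. Functionally, each block is processed as C = AES-ENC_k1(P xor T) xor T, with the tweak doubled between blocks. A one-block-at-a-time C reference for the same result (not part of the patch; reuses the xts_mul_x sketch given near GHASH_POLY above and requires AES-NI):

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void xts_encrypt_full_blocks(const uint8_t *round_keys /* 11*16 bytes */,
                                    uint8_t tweak[16],
                                    const uint8_t *pt, uint8_t *ct,
                                    size_t nblocks)
{
    for (size_t i = 0; i < nblocks; i++, pt += 16, ct += 16) {
        __m128i t  = _mm_loadu_si128((const __m128i *)tweak);
        __m128i st = _mm_xor_si128(_mm_loadu_si128((const __m128i *)pt), t);
        st = _mm_xor_si128(st, _mm_loadu_si128((const __m128i *)round_keys));
        for (int r = 1; r <= 9; r++)
            st = _mm_aesenc_si128(st,
                     _mm_loadu_si128((const __m128i *)(round_keys + 16 * r)));
        st = _mm_aesenclast_si128(st,
                 _mm_loadu_si128((const __m128i *)(round_keys + 16 * 10)));
        _mm_storeu_si128((__m128i *)ct, _mm_xor_si128(st, t));

        uint64_t tw[2];                 /* advance the tweak: T *= x          */
        memcpy(tw, tweak, 16);
        xts_mul_x(tw);                  /* sketch defined near GHASH_POLY     */
        memcpy(tweak, tw, 16);
    }
}
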
+
+section .text
+
+mk_global XTS_AES_128_enc_expanded_key_avx, function
+XTS_AES_128_enc_expanded_key_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+	vpxor	xmm4, xmm4			; clear scratch register (keys are already expanded; no key expansion is done here)
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
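The dispatch above peels off (N/16) mod 8 initial blocks so that the remaining full blocks form a multiple of eight; all of those except the final eight run through _main_loop, and the final eight plus any 1..15-byte tail are handled by _last_eight and _steal_cipher. A small C sketch of that bookkeeping (not part of the patch; names are illustrative):

#include <stdint.h>

struct xts_split { uint64_t initial_blocks, tail_bytes; };

static struct xts_split split_sector(uint64_t n)
{
    struct xts_split s;
    s.initial_blocks = (n >> 4) & 7;   /* and tmp1, (7 << 4)             */
    s.tail_bytes     = n & 15;         /* handled by ciphertext stealing */
    return s;
}
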
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+	vpblendvb	xmm3, xmm3, xmm2, xmm0	; xmm0 is the blend mask (explicit fourth operand in the AVX form)
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+	vpxor	xmm8, xmm3	; state stays in xmm8: instead of moving xmm3 to xmm8, the destination register of the vpxor is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm
new file mode 100644
index 000000000..8ac162c4c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm
@@ -0,0 +1,1505 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; expanded keys are not required to be aligned
+; plaintext and ciphertext are not required to be aligned
+; the second key (k1, used for data encryption) is stored on the stack, aligned to 16 bytes
+; the first key (k2, used for tweak encryption) is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
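+
+; Functionally, one _main_loop iteration built on the macro above is the plain per-block
+; XTS recipe sketched below (illustration only; it reuses aes128_encrypt_block and
+; xts_mul_x_u64 from the sketches earlier in this file, plus <string.h>). The tweak
+; doublings are simply interleaved with the AES rounds above to hide their latency.
+;
+;   /* T[8][16] holds the eight current tweaks, (lo, hi) the running twtempl:twtemph pair */
+;   static void xts_encrypt_8(const uint8_t rk1[16*11], uint8_t T[8][16],
+;                             uint64_t *lo, uint64_t *hi,
+;                             const uint8_t *pt, uint8_t *ct)
+;   {
+;       uint8_t b[16];
+;       for (int i = 0; i < 8; i++) {
+;           for (int j = 0; j < 16; j++) b[j] = pt[16*i + j] ^ T[i][j];
+;           aes128_encrypt_block(rk1, b, b);
+;           for (int j = 0; j < 16; j++) ct[16*i + j] = b[j] ^ T[i][j];
+;       }
+;       for (int i = 0; i < 8; i++) {             /* next eight tweaks: keep doubling */
+;           xts_mul_x_u64(lo, hi);
+;           memcpy(T[i], lo, 8);                  /* little-endian, as stored in [TW] */
+;           memcpy(T[i] + 8, hi, 8);
+;       }
+;   }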
+
+
+section .text
+
+mk_global XTS_AES_128_enc_expanded_key_sse, function
+XTS_AES_128_enc_expanded_key_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+ aesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
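+	; The stealing sequence above is the standard XTS ciphertext-stealing step; a rough
+	; C sketch (illustration only, reusing the assumed aes128_encrypt_block primitive and
+	; standard headers from the sketches near the top of this file). c_prev is the
+	; already-encrypted last full block held in xmm8, t_next the tweak just written to
+	; [TW], and b = N_val mod 16:
+	;
+	;   static void xts_steal(const uint8_t rk1[16*11], const uint8_t t_next[16],
+	;                         const uint8_t *p_tail, size_t b,
+	;                         const uint8_t c_prev[16],
+	;                         uint8_t *c_tail,        /* b bytes written here         */
+	;                         uint8_t c_last[16])     /* final full ciphertext block  */
+	;   {
+	;       uint8_t blk[16];
+	;       memcpy(blk, p_tail, b);                   /* partial plaintext tail ...   */
+	;       memcpy(blk + b, c_prev + b, 16 - b);      /* ... padded with stolen bytes */
+	;       memcpy(c_tail, c_prev, b);                /* short final ciphertext block */
+	;       for (int j = 0; j < 16; j++) blk[j] ^= t_next[j];
+	;       aes128_encrypt_block(rk1, blk, c_last);
+	;       for (int j = 0; j < 16; j++) c_last[j] ^= t_next[j];
+	;   }
+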
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_vaes.asm
new file mode 100644
index 000000000..730fdcba9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_vaes.asm
@@ -0,0 +1,1473 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; expanded keys are not aligned
+; the data key's round keys are copied to the stack in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_expanded_key_vaes(
+;  UINT8 *k2, // key used for tweaking, 16*11 bytes
+;  UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
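+
+; In the macro above each 128-bit lane's tweak is advanced eight blocks at once:
+; T' = T * x^8 mod (x^128 + x^7 + x^2 + x + 1), computed as (T << 8 bits) xor
+; clmul(top byte of T, 0x87). A rough scalar C equivalent of the
+; vpsrldq/vpclmulqdq/vpslldq/vpxord sequence, for illustration only:
+;
+;   #include <stdint.h>
+;   #include <string.h>
+;
+;   static void xts_mul_x8(uint8_t t[16])         /* t little-endian, as stored in TW  */
+;   {
+;       unsigned top = t[15];                     /* vpsrldq  zmm13, TW, 15            */
+;       unsigned red = 0;                         /* carry-less top * 0x87, <= 15 bits */
+;       for (int i = 0; i < 8; i++)
+;           if (top & (1u << i))
+;               red ^= 0x87u << i;                /* vpclmulqdq zmm14, zmm13, zpoly, 0 */
+;       memmove(t + 1, t, 15);                    /* vpslldq  zmm15, TW, 1             */
+;       t[0] = (uint8_t)red;                      /* vpxord: fold the reduction ...    */
+;       t[1] ^= (uint8_t)(red >> 8);              /* ... back into the low bytes       */
+;   }
+;
+; encrypt_by_16_zmm below chains two such steps (zmm15, then zmm17 from zmm15) so that
+; each lane advances by sixteen blocks per iteration.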
+
+
+; Encrypt 16 blocks in parallel
+; generate next 16 tweak values
+%macro encrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_128_enc_expanded_key_vaes, function
+XTS_AES_128_enc_expanded_key_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
+
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+ vmovdqa xmm8, xmm0
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+
+ vextracti32x4 xmm8, zmm2, 0x2
+ vextracti32x4 xmm0, zmm10, 0x3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+
+ vextracti32x4 xmm8, zmm2, 0x1
+ vextracti32x4 xmm0, zmm10, 0x2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+
+	vmovdqa	xmm8, xmm2
+ vextracti32x4 xmm0, zmm10, 0x1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+
+ vextracti32x4 xmm8, zmm1, 0x3
+ vextracti32x4 xmm0, zmm10, 0x0
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_3:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ vmovdqa xmm8, xmm3
+ vextracti32x4 xmm0, zmm9, 3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_2:
+ vextracti32x4 xmm10, zmm9, 1
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+
+ vmovdqa xmm8, xmm2
+ vextracti32x4 xmm0, zmm9, 2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ vmovdqa xmm8, xmm1
+ vextracti32x4 xmm0, zmm9, 1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+
+_start_by16:
+	; Make first 8 tweak values (T*x^0 .. T*x^7)
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
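+	; The two blocks below compute TW*x^k per 128-bit lane (k = 0..3 and
+	; k = 4..7): vpsllvq shifts both qwords of a lane left by k bits,
+	; vpsrlvq recovers the bits shifted out of each qword (positioned by
+	; the byte-15/byte-7 shuffle), the k2 = 0xaa merge carries the
+	; low-qword overflow into the high qword, and vpclmulqdq with
+	; zpoly = 0x87 reduces the bits shifted out of the top of the lane.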
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+	; Make next 8 tweak values by multiplying each by x^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
+
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+	vextracti32x4	xmm0, zmm4, 0x3		; keep last encrypted block
+ jmp _do_n_blocks
+
+_start_by8:
+	; Make first 8 tweak values (T*x^0 .. T*x^7)
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ ; load plaintext
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ ; store ciphertext
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+	vextracti32x4	xmm0, zmm2, 0x3		; keep last encrypted block
+ jmp _do_n_blocks
+
+_steal_cipher_next:
+ ; generate next Tweak value
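+	; (double the 128-bit tweak in GF(2^128): shift twtempl:twtemph left
+	;  by one bit and xor 0x87 into the low half if a bit carries out)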
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+ vmovdqa xmm0, [TW]
+
+_steal_cipher:
+ ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak
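+	; The first N_val bytes of the previous ciphertext block become the
+	; final partial output, its remaining 16-N_val bytes are stolen to pad
+	; the last partial plaintext block, and the padded block is encrypted
+	; with the next tweak and stored in place of the previous full block.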
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
+
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
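+;
+; Example: with N_val = 4, [vpshufb_shf_table + 4] loads the mask bytes
+; 0x84..0x8f, 0x00..0x03; the 0x8x entries (high bit set) zero the low 12
+; result bytes while the low 4 source bytes move to the top, i.e. a left
+; shift by 16 - N_val = 12 bytes, as used in _steal_cipher above.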
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+%else ; Assembler doesn't understand these opcodes. Add an empty symbol for Windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_128_enc_expanded_key_vaes
+no_XTS_AES_128_enc_expanded_key_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm
new file mode 100644
index 000000000..cbb98cc38
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm
@@ -0,0 +1,1530 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys	rsp + 16*8	; store 11 expanded round keys (AES-128)
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
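+
+; Resulting stack layout: the 8 tweak values at [TW], the 11 AES-128 round
+; keys at [keys], then (win64 only) the xmm6-xmm15 save area, and finally
+; the callee-saved general purpose registers at [_gpr].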
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_sse(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
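+
+; Illustrative caller (a hypothetical C harness, not part of this file; the
+; buffer names and the 4 KiB sector size are assumptions for the example):
+;
+;     uint8_t k2[16], k1[16], tweak[16];
+;     uint8_t pt[4096], ct[4096];
+;     XTS_AES_128_enc_sse(k2, k1, tweak, sizeof(pt), pt, ct);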
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of aeskeygenassist instruction
+; round_key value before this key_expansion_128 macro is current round key
+; round_key value after this key_expansion_128 macro is next round key
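+; The computation follows the standard AES-128 key schedule,
+;   w[i] = w[i-4] ^ (i mod 4 == 0 ? SubWord(RotWord(w[i-1])) ^ Rcon[i/4] : w[i-1]);
+; aeskeygenassist supplies SubWord(RotWord(.)) ^ Rcon (broadcast by pshufd),
+; and the two shufps/pxor pairs build the running xor of the previous round
+; key's four words.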
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
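+; (the initial tweak is encrypted with key2 through the ten AES-128 rounds
+;  while both key schedules are expanded on the fly; each key1 round key is
+;  stored to [keys] and the encrypted tweak is stored to [TW] at the end)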
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+
+ movdqu %%xkey2, [%%ptr_key2]
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*3], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*4], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*5], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*6], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*7], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*8], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*9], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*10], %%xkey1
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
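+; (the tweaks are advanced with the scalar shl/adc/cmovc/xor sequence: the
+;  128-bit tweak in twtempl:twtemph is doubled and GHASH_POLY = 0x87 is
+;  xor-ed into the low half whenever the top bit carries out; the results
+;  are written back to [TW] in the gaps between AES rounds)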
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_128_enc_sse, function
+XTS_AES_128_enc_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+ aesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm
new file mode 100644
index 000000000..3532ddda5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_vaes.asm
@@ -0,0 +1,1498 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, 16-byte aligned
+; the first key is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_vaes(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
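For orientation, this symbol is normally reached through the library's multibinary dispatcher rather than called directly. A minimal C sketch of a call, assuming the XTS_AES_128_enc prototype from this library's aes_xts.h and using the common sector-number-as-tweak convention (an illustration, not something this file mandates):

    #include <stdint.h>
    #include <string.h>
    #include "aes_xts.h"                /* isa-l_crypto public header */

    /* Encrypt one 512-byte sector; k1 and k2 are the two 16-byte AES-128 XTS keys. */
    static void encrypt_sector(uint8_t k2[16], uint8_t k1[16], uint64_t sector_no,
                               const uint8_t pt[512], uint8_t ct[512])
    {
        uint8_t iv[16] = {0};
        memcpy(iv, &sector_no, sizeof(sector_no));   /* little-endian sector number as tweak */
        XTS_AES_128_enc(k2, k1, iv, 512, pt, ct);    /* dispatcher selects the best variant  */
    }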
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+
+; produce the key for the next round
+; raw_key is the output of the vaeskeygenassist instruction
+; on entry, round_key holds the current round key
+; on exit, round_key holds the next round key
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
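For readers less used to the vshufps trick: with xtmp pre-zeroed, the shuffle/xor cascade above folds the previous round-key words together and then xors in the broadcast vaeskeygenassist result, which is the standard AES-128 key-schedule step. A hedged C sketch of the same computation with AES-NI intrinsics, using byte shifts in place of the shuffles (a model, not this file's code path):

    #include <immintrin.h>   /* AES-NI intrinsics; compile with -maes */

    /* One AES-128 key-schedule step: 'assist' is the aeskeygenassist result for
     * the current round key and round constant; returns the next round key. */
    static __m128i aes128_next_round_key(__m128i key, __m128i assist)
    {
        __m128i t;
        assist = _mm_shuffle_epi32(assist, 0xff);  /* broadcast SubWord(RotWord(w3)) ^ rcon */
        t   = _mm_slli_si128(key, 4);              /* key words shifted up by one dword     */
        key = _mm_xor_si128(key, t);
        t   = _mm_slli_si128(t, 4);
        key = _mm_xor_si128(key, t);
        t   = _mm_slli_si128(t, 4);
        key = _mm_xor_si128(key, t);               /* key now holds the w0^..^wi cascade    */
        return _mm_xor_si128(key, assist);
    }

    /* usage: rk1 = aes128_next_round_key(rk0, _mm_aeskeygenassist_si128(rk0, 0x01)); */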
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*3], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*4], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*5], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*6], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*7], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*8], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*9], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*10], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
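The repeated shl/adc/cmovc/xor runs above are a two-register GF(2^128) doubling of the tweak (multiply by x, reduce by the 0x87 polynomial). A scalar C model of one such step, with the tweak held as two little-endian 64-bit halves as on the stack here:

    #include <stdint.h>

    /* tw[0] = low half, tw[1] = high half of the 128-bit tweak */
    static void xts_tweak_double(uint64_t tw[2])
    {
        uint64_t carry = tw[1] >> 63;               /* bit pushed out of bit 127        */
        tw[1] = (tw[1] << 1) | (tw[0] >> 63);       /* asm: shl low, 1 / adc high, high */
        tw[0] = (tw[0] << 1) ^ (carry ? 0x87 : 0);  /* asm: cmovc + xor with GHASH_POLY */
    }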
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
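The vpsrldq/vpclmulqdq/vpslldq triple inside this macro advances every tweak lane by eight blocks at once: each 128-bit tweak is multiplied by x^8 (a one-byte shift), and the byte that falls off the top is folded back in through the 0x87 polynomial. A scalar C model of one lane, offered only as an illustration of the math:

    #include <stdint.h>

    /* tw = tw * x^8 in GF(2^128) mod x^128 + x^7 + x^2 + x + 1 (0x87). */
    static void xts_tweak_mul_x8(uint64_t tw[2])
    {
        uint64_t top  = tw[1] >> 56;                /* byte that will spill past bit 127 */
        uint64_t fold = 0;
        for (int b = 0; b < 8; b++)                 /* carry-less multiply: top * 0x87   */
            if (top & (1ull << b))
                fold ^= (uint64_t)0x87 << b;
        tw[1] = (tw[1] << 8) | (tw[0] >> 56);       /* the one-byte left shift (vpslldq) */
        tw[0] = (tw[0] << 8) ^ fold;                /* fold the reduction back in        */
    }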
+
+
+; Encrypt 16 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_128_enc_vaes, function
+XTS_AES_128_enc_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
+
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+ vmovdqa xmm8, xmm0
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+
+ vextracti32x4 xmm8, zmm2, 0x2
+ vextracti32x4 xmm0, zmm10, 0x3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+
+ vextracti32x4 xmm8, zmm2, 0x1
+ vextracti32x4 xmm0, zmm10, 0x2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+
+ movdqa xmm8, xmm2
+ vextracti32x4 xmm0, zmm10, 0x1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+
+ vextracti32x4 xmm8, zmm1, 0x3
+ vextracti32x4 xmm0, zmm10, 0x0
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_3:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ vmovdqa xmm8, xmm3
+ vextracti32x4 xmm0, zmm9, 3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_2:
+ vextracti32x4 xmm10, zmm9, 1
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+
+ vmovdqa xmm8, xmm2
+ vextracti32x4 xmm0, zmm9, 2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ vmovdqa xmm8, xmm1
+ vextracti32x4 xmm0, zmm9, 1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+
+_start_by16:
+ ; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+ ; Make next 8 tweak values by all x 2^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
+
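The vpsllvq/vpsrlvq/vpclmulqdq groups above build the first eight tweaks directly: lane j ends up holding the initial tweak multiplied by x^j, with the j bits that spill past bit 127 reduced through 0x87 (the shufb_15_7 shuffle isolates the bytes whose bits spill). One lane of that computation, sketched in scalar C under the same two-word layout used earlier:

    #include <stdint.h>

    /* out = tw * x^j in GF(2^128) mod 0x87, for small j (0..7). */
    static void xts_tweak_mul_xj(const uint64_t tw[2], int j, uint64_t out[2])
    {
        uint64_t spill = j ? tw[1] >> (64 - j) : 0;            /* j bits pushed past bit 127 */
        out[1] = (tw[1] << j) | (j ? tw[0] >> (64 - j) : 0);
        out[0] = tw[0] << j;
        for (int b = 0; b < j; b++)                            /* carry-less fold by 0x87    */
            if (spill & (1ull << b))
                out[0] ^= (uint64_t)0x87 << b;
    }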
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ vextracti32x4 xmm0, zmm4, 0x3 ; keep the last encrypted block
+ jmp _do_n_blocks
+
+_start_by8:
+ ; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ ; load plaintext
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ ; store ciphertext
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ vextracti32x4 xmm0, zmm2, 0x3 ; keep the last encrypted block
+ jmp _do_n_blocks
+
+_steal_cipher_next:
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+ vmovdqa xmm0, [TW]
+
+_steal_cipher:
+ ; start of simplified cipher stealing: xmm8 holds the last cipher block, xmm0 holds the next tweak
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
+
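The _steal_cipher path above is standard XTS ciphertext stealing: the final r = N mod 16 plaintext bytes borrow the tail of the previous ciphertext block, and that previous block is then re-encrypted under the last tweak. A byte-level C sketch of the same idea; aes128_encrypt_block is a hypothetical single-block helper, not something defined in this file:

    #include <stdint.h>
    #include <string.h>

    /* hypothetical one-block AES-128 ECB encryption helper */
    void aes128_encrypt_block(const uint8_t round_keys[11][16],
                              const uint8_t in[16], uint8_t out[16]);

    /* m full blocks already written; r trailing bytes remain (0 < r < 16).
     * prev_ct is the ciphertext of the last full block, tweak is the tweak
     * that would have been used for the missing full block. */
    static void xts_steal_tail(const uint8_t *pt, uint8_t *ct, size_t m, size_t r,
                               const uint8_t tweak[16], const uint8_t prev_ct[16],
                               const uint8_t round_keys[11][16])
    {
        uint8_t buf[16];
        memcpy(ct + 16 * m, prev_ct, r);            /* short final block = head of prev_ct */
        memcpy(buf, pt + 16 * m, r);                /* partial plaintext ...               */
        memcpy(buf + r, prev_ct + r, 16 - r);       /* ... padded with the stolen tail     */
        for (int i = 0; i < 16; i++) buf[i] ^= tweak[i];
        aes128_encrypt_block(round_keys, buf, buf);
        for (int i = 0; i < 16; i++) buf[i] ^= tweak[i];
        memcpy(ct + 16 * (m - 1), buf, 16);         /* overwrite the previous ct block     */
    }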
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7*16)
+ cmp tmp1, (6*16)
+ je _num_blocks_is_6
+ cmp tmp1, (5*16)
+ je _num_blocks_is_5
+ cmp tmp1, (4*16)
+ je _num_blocks_is_4
+ cmp tmp1, (3*16)
+ je _num_blocks_is_3
+ cmp tmp1, (2*16)
+ je _num_blocks_is_2
+ cmp tmp1, (1*16)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+%else ; Assembler does not understand these opcodes. Add an empty symbol for Windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_128_enc_vaes
+no_XTS_AES_128_enc_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm
new file mode 100644
index 000000000..776525bdd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm
@@ -0,0 +1,1962 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, 16-byte aligned
+; the first key is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_avx(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
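As with the encrypt side, this symbol is normally reached through the library's dispatcher; a minimal C sketch of a call, assuming the XTS_AES_256_dec prototype from aes_xts.h (the sector-number-as-tweak convention is again only an illustration):

    #include <stdint.h>
    #include <string.h>
    #include "aes_xts.h"

    /* Decrypt one 512-byte sector; k1 and k2 are the two 32-byte AES-256 XTS keys. */
    static void decrypt_sector(uint8_t k2[32], uint8_t k1[32], uint64_t sector_no,
                               const uint8_t ct[512], uint8_t pt[512])
    {
        uint8_t iv[16] = {0};
        memcpy(iv, &sector_no, sizeof(sector_no));
        XTS_AES_256_dec(k2, k1, iv, 512, ct, pt);
    }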
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of the vaeskeygenassist instruction
+; on entry, round_key holds the current round key
+; on exit, round_key holds the next round key
+; the two macros below (flip/flop) are used alternately for 256-bit key generation
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 10101010b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 11
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%xtmp2 %8
+%define %%ptr_key2 %9
+%define %%ptr_key1 %10
+%define %%ptr_expanded_keys %11
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xtmp2
+
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xtmp2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xtmp2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
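Since this is the decrypt path, the macro above also prepares the Equivalent Inverse Cipher key schedule: the round keys are stored in reverse order and all but the outer two are passed through vaesimc. A compact intrinsics sketch of that transformation, assuming the 15 AES-256 encryption round keys are already expanded (a model, not this file's code):

    #include <immintrin.h>   /* _mm_aesimc_si128; compile with -maes */

    /* Build the decryption schedule from the 15 AES-256 encryption round keys:
     * reverse the order and apply InvMixColumns to all but the first and last. */
    static void aes256_dec_key_schedule(const __m128i enc[15], __m128i dec[15])
    {
        dec[0] = enc[14];                        /* last encryption key comes first   */
        for (int i = 1; i < 14; i++)
            dec[i] = _mm_aesimc_si128(enc[14 - i]);
        dec[14] = enc[0];                        /* original (round 0) key comes last */
    }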
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Decrypt 8 blocks in parallel (uses vaesdec; the macro keeps the encrypt_by_eight
+; name shared with the encryption variant)
+; generate next 8 tweak values
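+;
+; the tweak update scattered across the AES rounds below is a multiplication
+; by x in GF(2^128): the 128-bit tweak held in twtempl:twtemph is shifted left
+; by one (shl/adc) and, when the carry is set, the low byte is xor-ed with the
+; reduction constant 0x87 (ghash_poly_8b); stitching this scalar work into the
+; vaesdec rounds hides its latency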
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_256_dec_avx, function
+XTS_AES_256_dec_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks (128 bytes) are not stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
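+	; tmp1 = N & (7 << 4) is 16 times the number (0..7) of whole blocks
+	; processed before the first group of eight, so the length left for the
+	; stitched main loop is a multiple of 128 bytes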
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
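+	; a non-zero N mod 16 means the tail needs ciphertext stealing: one more
+	; tweak is generated and swapped with the tweak of the last full block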
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7]
+ vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
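+	; in XTS decryption with ciphertext stealing the tweak order is swapped:
+	; the last full ciphertext block uses the newly generated tweak (written
+	; to [TW + 16*7] below), and the saved previous tweak at [TW] is used
+	; later in _steal_cipher for the partial block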
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+	vaesdec         xmm8, [keys + 16*10]               ; round 10
+	vaesdec         xmm8, [keys + 16*11]               ; round 11
+	vaesdec         xmm8, [keys + 16*12]               ; round 12
+	vaesdec         xmm8, [keys + 16*13]               ; round 13
+	vaesdeclast     xmm8, [keys + 16*14]               ; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm
new file mode 100644
index 000000000..d52d0977e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm
@@ -0,0 +1,1896 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; the data key schedule (k1, second argument) is copied to the stack, 16-Byte aligned
+; the tweak key schedule (k2, first argument) is used only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
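+; 0x87 is the low byte of the XTS tweak reduction polynomial
+; x^128 + x^7 + x^2 + x + 1; it is xor-ed into the shifted tweak whenever the
+; 128-bit left shift carries out of bit 127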
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
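+;
+; illustrative call from C (a sketch only; both key schedules must already be
+; expanded to 15 round keys each in the layout this routine expects, k2 for
+; tweak encryption and k1 for data decryption):
+;     XTS_AES_256_dec_expanded_key_avx(k2_exp, k1_exp, tweak, 512, ct, pt);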
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
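+	; the tweak is encrypted with the k2 schedule while, in the same pass, the
+	; k1 round keys are copied from unaligned memory into the 16-Byte aligned
+	; stack area at [keys]; interleaving the two streams overlaps the loads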
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*14]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*13]
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*12]
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*11]
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*11]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*12]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*13]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*14]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*0]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; decrypt initial blocks of AES (uses vaesdec; the macro keeps the encrypt_initial
+; name shared with the encryption variant)
+; 1, 2, 3, 4, 5, 6 or 7 blocks are processed
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Decrypt 8 blocks in parallel (uses vaesdec; the macro keeps the encrypt_by_eight
+; name shared with the encryption variant)
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_256_dec_expanded_key_avx, function
+XTS_AES_256_dec_expanded_key_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
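+	; 0x87 is the low byte of the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1;
+	; it is xored into the tweak whenever the GF(2^128) doubling carries out of bit 127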
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks are not stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7]
+	vmovdqa	[TW + 16*0], xmm1			; swap tweak values for ciphertext stealing on the decrypt path
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
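+	; Decrypt-direction ciphertext stealing: xmm8 holds the last full block,
+	; decrypted under the swapped-in (next) tweak. Its leading N_val bytes are
+	; the final partial plaintext; the remaining bytes are blended with the
+	; partial ciphertext block and decrypted below under the original tweak
+	; saved at [TW].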
+
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+	vpxor	xmm8, xmm3	; result lands in xmm8: instead of moving xmm3 to xmm8, the vpxor destination is simply swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+	vaesdec	xmm8, [keys + 16*10]	; round 10
+	vaesdec	xmm8, [keys + 16*11]	; round 11
+	vaesdec	xmm8, [keys + 16*12]	; round 12
+	vaesdec	xmm8, [keys + 16*13]	; round 13
+	vaesdeclast	xmm8, [keys + 16*14]	; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
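+; A 16-byte window taken from this table acts as a byte-wise shift under
+; vpshufb, since selector bytes with the top bit set produce zero. Xoring the
+; window with mask1 (0x80 per byte) flips those top bits, giving both the
+; complementary shift and the per-byte selector used by vpblendvb during
+; cipher stealing.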
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm
new file mode 100644
index 000000000..2e77e5e80
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm
@@ -0,0 +1,1898 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
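+; Note: N is the sector length in bytes and must be at least one 16-byte
+; block; shorter inputs return without producing output. A trailing partial
+; block is handled with ciphertext stealing.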
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
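+; The initial tweak is always encrypted with key2 (aesenc), even though this
+; is the decrypt path, while key1's 15 expanded round keys are copied into
+; the aligned stack area for the aesdec rounds; the result is stored at [TW].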
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*14]
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*13]
+ movdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*12]
+ movdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*11]
+ movdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*11]
+ aesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*12]
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*13]
+ aesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*14]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*0]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
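+; Each tweak after the first is derived from the previous one by the same
+; double-and-reduce step on twtempl:twtemph, spilled to TW[] and loaded into
+; its tweak register alongside the matching input block.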
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
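+; When lt128 is 0, the eight tweaks for the next iteration are computed in
+; twtempl:twtemph and spilled to TW[] in small slices between the aesdec
+; rounds so the scalar work overlaps the AES rounds; when lt128 is 1 these
+; updates are skipped.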
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
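+; The next-tweak computation is interleaved with the aesdec rounds below;
+; here all eight states are always processed, so no per-block %if guards are
+; needed.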
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_256_dec_expanded_key_sse, function
+XTS_AES_256_dec_expanded_key_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks are not stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
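+	; The five instructions above multiply the 128-bit tweak (twtempl:twtemph)
+	; by x in GF(2^128), reducing with GHASH_POLY = 0x87. A minimal C sketch of
+	; the same update (illustrative only; lo/hi stand for twtempl/twtemph):
+	;
+	;   uint64_t carry = hi >> 63;             /* CF produced by adc hi, hi   */
+	;   hi = (hi << 1) | (lo >> 63);           /* shl lo, 1 ; adc hi, hi      */
+	;   lo = (lo << 1) ^ (carry ? 0x87 : 0);   /* cmovc + xor with GHASH_POLY */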
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
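+	; Ciphertext stealing on the decrypt path: the tweak order of the last two
+	; blocks is reversed (see the "swap tweak values" step at _last_eight), so
+	; xmm8 already holds the block decrypted with the following tweak. Below,
+	; pshufb_shf_table aligns the stolen tail bytes, pblendvb merges them with
+	; the partial ciphertext, and the result is decrypted with the tweak saved
+	; at [TW].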
+
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+	aesdec	xmm8, [keys + 16*10]                  ; round 10
+	aesdec	xmm8, [keys + 16*11]                  ; round 11
+	aesdec	xmm8, [keys + 16*12]                  ; round 12
+	aesdec	xmm8, [keys + 16*13]                  ; round 13
+	aesdeclast	xmm8, [keys + 16*14]                 ; round 14
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm15
+ movdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm14
+ movdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm13
+ movdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm12
+ movdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm11
+ movdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm10
+ movdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm9
+ movdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_vaes.asm
new file mode 100644
index 000000000..69228c18c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_vaes.asm
@@ -0,0 +1,1808 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_expanded_key_vaes(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
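+; A minimal C-level usage sketch (illustrative only): the buffer names below
+; are placeholders, and both key schedules are assumed to be expanded already
+; by the caller, e.g. with the ISA-L key-expansion helpers.
+;
+;   uint8_t k2_exp[16*15], k1_exp[16*15];  /* expanded AES-256 round keys   */
+;   uint8_t tweak[16];                     /* initial tweak (sector number) */
+;   uint8_t ct[4096], pt[4096];            /* one sector in, one sector out */
+;   XTS_AES_256_dec_expanded_key_vaes(k2_exp, k1_exp, tweak, sizeof(ct), ct, pt);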
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*14]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*13]
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*12]
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*11]
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*11]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*12]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*13]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*14]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*0]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
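+; encrypt_T both encrypts the initial tweak with the full 14-round key2
+; schedule and, interleaved with those rounds, copies key1's expanded round
+; keys into the aligned stack area at [keys]; the encrypted tweak is left
+; at [TW] for the tweak-generation code below.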
+
+
+; Original way to generate initial tweak values and load plaintext values
+; only used for small blocks
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
+
+
+; Original decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values can be generated
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+
+
+; Decrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro decrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
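+	; Unless %%last_eight is set, the vpsrldq/vpclmulqdq/vpslldq/vpxord steps
+	; interleaved with the AES rounds advance every tweak lane by eight block
+	; positions: a one-byte (x^8) left shift per 128-bit lane, with the
+	; shifted-out top byte reduced back in via a carry-less multiply by zpoly.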
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
+
+
+; Decrypt 16 blocks in parallel
+; generate next 16 tweak values
+%macro decrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_256_dec_expanded_key_vaes, function
+XTS_AES_256_dec_expanded_key_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
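+	; Dispatch: sectors of at least 256 bytes run the 16-block zmm loop,
+	; 128-255 bytes run the 8-block loop, and the remainder after those loops
+	; is handled block by block in _do_n_blocks, with cipher stealing for a
+	; partial tail.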
+
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+	vmovdqu	xmm1, [ptr_plaintext - 16]	; Re-do the last block with the next tweak
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext - 16], xmm1
+ vmovdqa xmm8, xmm1
+
+ ; Calc previous tweak
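+	; (i.e. divide the current tweak by x in GF(2^128) to derive the preceding
+	; tweak, which _steal_cipher then uses for the final partial block)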
+ mov tmp1, 1
+ kmovq k1, tmp1
+ vpsllq xmm13, xmm9, 63
+ vpsraq xmm14, xmm13, 63
+ vpandq xmm5, xmm14, XWORD(zpoly)
+ vpxorq xmm9 {k1}, xmm9, xmm5
+ vpsrldq xmm10, xmm9, 8
+ vpshrdq xmm0, xmm9, xmm10, 1
+ vpslldq xmm13, xmm13, 8
+ vpxorq xmm0, xmm0, xmm13
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7_remain
+ vextracti32x4 xmm12, zmm10, 2
+ vextracti32x4 xmm13, zmm10, 3
+ vinserti32x4 zmm10, xmm13, 2
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+ vextracti32x4 xmm8, zmm2, 0x2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_7_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6_remain
+ vextracti32x4 xmm12, zmm10, 1
+ vextracti32x4 xmm13, zmm10, 2
+ vinserti32x4 zmm10, xmm13, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+ vextracti32x4 xmm8, zmm2, 0x1
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_6_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ jmp _ret_
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5_remain
+ vmovdqa xmm12, xmm10
+ vextracti32x4 xmm10, zmm10, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_5_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4_remain
+ vextracti32x4 xmm12, zmm9, 3
+ vinserti32x4 zmm9, xmm10, 3
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+ vextracti32x4 xmm8, zmm1, 0x3
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_4_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ jmp _ret_
+
+_remaining_num_blocks_is_3:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3_remain
+ vextracti32x4 xmm13, zmm9, 2
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 3
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ vmovdqa xmm0, xmm13
+ jmp _steal_cipher
+_done_3_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ jmp _ret_
+
+_remaining_num_blocks_is_2:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2_remain
+ vextracti32x4 xmm10, zmm9, 2
+ vextracti32x4 xmm12, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_2_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ and N_val, 15
+ je _done_1_remain
+ vextracti32x4 xmm11, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+ vmovdqa xmm8, xmm1
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+_done_1_remain:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ jmp _ret_
+
+
+
+_start_by16:
+	; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+	; Make next 8 tweak values by multiplying them all by 2^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
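+	; At this point zmm9/zmm10 hold the tweaks for blocks 0-3 and 4-7 of the
+	; 16-block chunk, and zmm11/zmm12 hold the same tweaks advanced by 2^8,
+	; i.e. the tweaks for blocks 8-11 and 12-15.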
+
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
+_start_by8:
+	; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
+_steal_cipher:
+ ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;decrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdec xmm8, [keys + 16*10] ; round 10
+ vaesdec xmm8, [keys + 16*11] ; round 11
+ vaesdec xmm8, [keys + 16*12] ; round 12
+ vaesdec xmm8, [keys + 16*13] ; round 13
+ vaesdeclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
+
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa64 xmm16, xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa64 xmm0, xmm16
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm15, xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm0, xmm15
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm14, xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm0, xmm14
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm13, xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm0, xmm13
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm12, xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm0, xmm12
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm11, xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm0, xmm11
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ and N_val, 15
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm0, xmm10
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+%else ; The assembler does not understand these opcodes. Add an empty symbol for Windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_256_dec_expanded_key_vaes
+no_XTS_AES_256_dec_expanded_key_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm
new file mode 100644
index 000000000..3904c8a54
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm
@@ -0,0 +1,1963 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; input keys are not assumed to be aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not assumed to be aligned
+; the expanded schedule of the second key (k1) is stored on the stack, 16-byte aligned
+; the first key (k2) is needed only once, to encrypt the tweak, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_sse(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
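+
+; A minimal usage sketch from C (illustrative only; it assumes the routine is
+; declared to the caller, e.g. via the library's aes_xts.h header, and the buffer
+; names are made up):
+;
+;   uint8_t k1[32], k2[32], tweak[16];
+;   uint8_t ct[4096], pt[4096];
+;   /* ... fill keys, tweak and ciphertext ... */
+;   XTS_AES_256_dec_sse(k2, k1, tweak, sizeof(ct), ct, pt);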
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
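+; Note: the ptr_plaintext/ptr_ciphertext names below are carried over from the
+; encrypt variant of this file. Per the prototype above, in this decrypt routine
+; ptr_plaintext holds the ciphertext input (ct) and ptr_ciphertext holds the
+; plaintext output (pt); the "load plaintext"/"store ciphertext" comments follow
+; the register names rather than the data they carry.
+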
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of the aeskeygenassist instruction
+; round_key value before this key_expansion_256 macro is the current round key
+; round_key value after this key_expansion_256 macro is the next round key
+; 2 macros will be used for key generation in a flip-flopped fashion
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 10101010b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
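+
+; Rough summary of the flip/flop pattern above (AES-256 key schedule): the
+; aeskeygenassist result carries SubWord(X3) in dword 2 and RotWord(SubWord(X3))
+; xor RCON in dword 3. key_expansion_256_flip broadcasts dword 3 (pshufd 0xFF)
+; to derive the even 128-bit halves of the schedule, key_expansion_256_flop
+; broadcasts dword 2 (pshufd 0xAA) for the odd halves, and the shufps/pxor
+; sequence forms the running (prefix) XOR of the previous round key's words
+; before the broadcast value is XORed in.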
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 11
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%xtmp2 %8
+%define %%ptr_key2 %9
+%define %%ptr_key1 %10
+%define %%ptr_expanded_keys %11
+
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ movdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*13], %%xtmp2
+
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*12], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*11], %%xtmp2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*10], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*9], %%xtmp2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*8], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*7], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*6], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*5], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*4], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*3], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*2], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*1], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
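+
+; Key layout produced by encrypt_T for this decrypt routine (as written above):
+; the last encryption round key (round 14) is stored at [keys + 16*0] and is used
+; for the initial AddRoundKey of decryption, round keys 13..1 are run through
+; aesimc (InvMixColumns) and stored in reverse order for aesdec (the Equivalent
+; Inverse Cipher), and round key 0 is stored untransformed at [keys + 16*14] for
+; aesdeclast. The tweak itself is always encrypted with key2 (aesenc), as XTS
+; requires.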
+
+
+; generate initial tweak values
+; load initial input blocks (ptr_plaintext points at the ciphertext for this decrypt routine)
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
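+
+; The scalar tweak update used throughout this file (xor/shl/adc/cmovc/xor on
+; twtempl:twtemph) multiplies the 128-bit tweak by x in GF(2^128) with the XTS
+; polynomial 0x87. A rough C equivalent (illustrative, not part of the build):
+;
+;   uint64_t carry = hi >> 63;            /* bit shifted out of the 128-bit value */
+;   hi = (hi << 1) | (lo >> 63);          /* shl twtempl,1 ; adc twtemph,twtemph  */
+;   lo = (lo << 1) ^ (carry ? 0x87 : 0);  /* cmovc + xor with ghash_poly_8b       */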
+
+
+; decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values are generated
+; (the encrypt_initial macro name is retained from the encrypt variant)
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Decrypt 8 blocks in parallel (the encrypt_by_eight macro name is retained from the encrypt variant)
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
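+
+; The macro above stitches the scalar generation of the next eight tweak values
+; in between the aesdec rounds, so the GF(2^128) doublings run in the shadow of
+; the AES round latency; when %%last_eight is non-zero the guarded tweak updates
+; are skipped because no further tweaks are needed.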
+
+
+section .text
+
+mk_global XTS_AES_256_dec_sse, function
+XTS_AES_256_dec_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks (128 bytes) are not stitched with tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
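+ ; target_ptr_val now points at the output position where the final (up to)
+ ; eight blocks begin; the main loop runs until ptr_ciphertext reaches it, and
+ ; the last 128 bytes are handled separately so that a trailing partial block
+ ; can be folded in via cipher stealing.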
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
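+ ; For decryption with cipher stealing the tweak order of the last two blocks
+ ; is swapped: the last full block is decrypted with the freshly generated
+ ; tweak (written into TW[7] above), while its own tweak, parked in TW[0], is
+ ; used later in _steal_cipher for the re-assembled partial block.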
+
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+ aesdec xmm8, [keys + 16*10] ; round 10
+ aesdec xmm8, [keys + 16*11] ; round 11
+ aesdec xmm8, [keys + 16*12] ; round 12
+ aesdec xmm8, [keys + 16*13] ; round 13
+ aesdeclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm15
+ movdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm14
+ movdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm13
+ movdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm12
+ movdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm11
+ movdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm10
+ movdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm9
+ movdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
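The pshufb_shf_table and mask1 constants above drive the byte shuffles used for XTS cipher stealing when the sector length is not a multiple of 16. As an illustration of the data movement those shuffles achieve (a hedged sketch only; the helper name and buffer layout are placeholders, not ISA-L code):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* For a trailing partial block of n bytes (1 <= n <= 15): prev holds the
 * output of the last full block, tail holds the n leftover input bytes.
 * The first n bytes of prev are emitted as the short final block, and the
 * remaining 16-n bytes are "stolen" to pad the block that receives one
 * more AES pass under the last tweak. */
static void xts_steal_layout(uint8_t padded_block[16], uint8_t short_block[16],
                             const uint8_t prev[16], const uint8_t tail[16],
                             size_t n)
{
    memcpy(short_block, prev, n);               /* short final output block  */
    memcpy(padded_block, tail, n);              /* leftover input bytes      */
    memcpy(padded_block + n, prev + n, 16 - n); /* stolen bytes complete it  */
}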
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm
new file mode 100644
index 000000000..3e26e5c04
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_vaes.asm
@@ -0,0 +1,1875 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored on the stack, aligned to 16 bytes
+; first key is required only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_vaes(
+;               UINT8 *k2,      // key used for tweaking, 16*2 bytes
+;               UINT8 *k1,      // key used for "ECB" decryption, 16*2 bytes
+;               UINT8 *TW_initial,      // initial tweak value, 16 bytes
+;               UINT64 N,       // sector size, in bytes
+;               const UINT8 *ct,        // ciphertext sector input data
+;               UINT8 *pt);     // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
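A hedged usage sketch for this entry point: the C declaration below is inferred from the prototype comment above and is an assumption (ISA-L's public header may differ), and the 4096-byte sector size and buffer names are placeholders. For the decrypt variant the data input is a ciphertext sector and the output is the recovered plaintext.

#include <stdint.h>

void XTS_AES_256_dec_vaes(uint8_t *k2, uint8_t *k1, uint8_t *TW_initial,
                          uint64_t N, const uint8_t *ct, uint8_t *pt);

/* Decrypt one 4096-byte sector: k2 drives the tweak, k1 the data blocks. */
static void decrypt_sector(uint8_t key2[32], uint8_t key1[32], uint8_t tweak[16],
                           const uint8_t ct[4096], uint8_t pt[4096])
{
    XTS_AES_256_dec_vaes(key2, key1, tweak, 4096, ct, pt);
}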
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+; produce the key for the next round
+; raw_key is the output of vaeskeygenassist instruction
+; round_key value before this key_expansion_256 macro is the current round key
+; round_key value after this key_expansion_256 macro is the next round key
+; 2 macros will be used for key generation in a flip-flopped fashion
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 10101010b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 11
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%xtmp2 %8
+%define %%ptr_key2 %9
+%define %%ptr_key1 %10
+%define %%ptr_expanded_keys %11
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xtmp2
+
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xtmp2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xtmp2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
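The vaesimc/vmovdqa pairs in encrypt_T build the decryption key schedule while the tweak is being encrypted: round keys are stored in reverse order, and the middle keys are run through InvMixColumns so that vaesdec (the equivalent inverse cipher) can consume them. A minimal C sketch of that storage rule, assuming AES-NI intrinsics are available; the helper name is illustrative, not an ISA-L symbol.

#include <wmmintrin.h>  /* AES-NI intrinsics */

/* Place one AES-256 encryption round key (round 0..14) into the schedule
 * consumed by AESDEC: reverse the order, and apply InvMixColumns to rounds
 * 1..13; the first and last round keys are stored unchanged. */
static void store_dec_round_key(__m128i dec_keys[15], __m128i enc_key, int round)
{
    __m128i k = (round == 0 || round == 14) ? enc_key
                                            : _mm_aesimc_si128(enc_key);
    dec_keys[14 - round] = k;
}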
+
+
+; Original way to generate initial tweak values and load plaintext values
+; only used for small blocks
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
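Each shl/adc/cmovc/xor sequence above multiplies the current 128-bit tweak (kept as the twtempl/twtemph pair) by x in GF(2^128), folding the bit that falls off the top back in with the 0x87 polynomial defined as GHASH_POLY. A small C sketch of the same step (the function name is illustrative only):

#include <stdint.h>

/* Multiply a 128-bit XTS tweak, held as two little-endian 64-bit halves,
 * by x in GF(2^128) with reduction by 0x87. */
static void xts_mult_x(uint64_t *lo, uint64_t *hi)
{
    uint64_t carry_out = *hi >> 63;            /* bit shifted out of the top  */
    uint64_t carry_mid = *lo >> 63;            /* bit carried low -> high     */

    *hi = (*hi << 1) | carry_mid;              /* shl twtempl,1 / adc twtemph */
    *lo = (*lo << 1) ^ (carry_out ? 0x87 : 0); /* cmovc + xor with GHASH_POLY */
}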
+
+
+; Original routine to decrypt the initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values can be generated
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+
+
+; Decrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro decrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
+
+
+; Decrypt 16 blocks in parallel
+; generate next 16 tweak values
+%macro decrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
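Between AES rounds, decrypt_by_eight_zmm and decrypt_by_16_zmm advance four tweaks per zmm register by x^8 in one step: each 128-bit lane is shifted left by one byte and the byte that drops off the top is folded back in with a carry-less multiply against the 0x87 constant broadcast in zpoly. A hedged intrinsics sketch of that step (assumes a compiler with AVX-512BW and VPCLMULQDQ support; the helper name is illustrative):

#include <immintrin.h>

/* Advance four 128-bit tweaks held in one zmm register by x^8 each,
 * mirroring the vpsrldq/vpclmulqdq/vpslldq/vpxord sequence above.
 * zpoly holds the 0x87 reduction constant in every 128-bit lane. */
static __m512i xts_tweaks_mult_x8(__m512i tw, __m512i zpoly)
{
    __m512i top  = _mm512_bsrli_epi128(tw, 15);                /* vpsrldq tw, 15 */
    __m512i fold = _mm512_clmulepi64_epi128(top, zpoly, 0x00); /* vpclmulqdq     */
    __m512i shl  = _mm512_bslli_epi128(tw, 1);                 /* vpslldq tw, 1  */
    return _mm512_xor_si512(shl, fold);                        /* vpxord         */
}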
+
+
+section .text
+
+mk_global XTS_AES_256_dec_vaes, function
+XTS_AES_256_dec_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
+
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+	vmovdqu	xmm1, [ptr_plaintext - 16]	; Redo last block with next tweak
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext - 16], xmm1
+ vmovdqa xmm8, xmm1
+
+ ; Calc previous tweak
+ mov tmp1, 1
+ kmovq k1, tmp1
+ vpsllq xmm13, xmm9, 63
+ vpsraq xmm14, xmm13, 63
+ vpandq xmm5, xmm14, XWORD(zpoly)
+ vpxorq xmm9 {k1}, xmm9, xmm5
+ vpsrldq xmm10, xmm9, 8
+ vpshrdq xmm0, xmm9, xmm10, 1
+ vpslldq xmm13, xmm13, 8
+ vpxorq xmm0, xmm0, xmm13
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7_remain
+ vextracti32x4 xmm12, zmm10, 2
+ vextracti32x4 xmm13, zmm10, 3
+ vinserti32x4 zmm10, xmm13, 2
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+ vextracti32x4 xmm8, zmm2, 0x2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_7_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6_remain
+ vextracti32x4 xmm12, zmm10, 1
+ vextracti32x4 xmm13, zmm10, 2
+ vinserti32x4 zmm10, xmm13, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+ vextracti32x4 xmm8, zmm2, 0x1
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_6_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ jmp _ret_
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5_remain
+ vmovdqa xmm12, xmm10
+ vextracti32x4 xmm10, zmm10, 1
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_5_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4_remain
+ vextracti32x4 xmm12, zmm9, 3
+ vinserti32x4 zmm9, xmm10, 3
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+ vextracti32x4 xmm8, zmm1, 0x3
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_4_remain:
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ jmp _ret_
+
+_remaining_num_blocks_is_3:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3_remain
+ vextracti32x4 xmm13, zmm9, 2
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 3
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ vmovdqa xmm0, xmm13
+ jmp _steal_cipher
+_done_3_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, na, na, na, na, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ jmp _ret_
+
+_remaining_num_blocks_is_2:
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2_remain
+ vextracti32x4 xmm10, zmm9, 2
+ vextracti32x4 xmm12, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ vmovdqa xmm0, xmm12
+ jmp _steal_cipher
+_done_2_remain:
+ vextracti32x4 xmm10, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, na, na, na, na, na, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ jmp _ret_
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ and N_val, 15
+ je _done_1_remain
+ vextracti32x4 xmm11, zmm9, 1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm11, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+ vmovdqa xmm8, xmm1
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+_done_1_remain:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, na, na, na, na, na, na, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ jmp _ret_
+
+
+
+_start_by16:
+	; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+	; Make next 8 tweak values by all x 2^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
+
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ decrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
+_start_by8:
+	; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ decrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ jmp _do_n_blocks
+
+_steal_cipher:
+ ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;decrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdec xmm8, [keys + 16*10] ; round 10
+ vaesdec xmm8, [keys + 16*11] ; round 11
+ vaesdec xmm8, [keys + 16*12] ; round 12
+ vaesdec xmm8, [keys + 16*13] ; round 13
+ vaesdeclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
+
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ and N_val, 15
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa64 xmm16, xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa64 xmm0, xmm16
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ and N_val, 15
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm15, xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm0, xmm15
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ and N_val, 15
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm14, xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm0, xmm14
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ and N_val, 15
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm13, xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm0, xmm13
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ and N_val, 15
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm12, xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm0, xmm12
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ and N_val, 15
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm11, xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm0, xmm11
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ and N_val, 15
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm0, xmm10
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+%else ; Assembler doesn't understand these opcodes. Add empty symbol for windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_256_dec_vaes
+no_XTS_AES_256_dec_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm
new file mode 100644
index 000000000..0993ff909
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm
@@ -0,0 +1,1708 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; input keys are not assumed to be aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not assumed to be aligned
+; the second key (k1) is expanded and stored on the stack, 16-Byte aligned
+; the first key (k2) is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
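+
+; Stack frame layout once 'sub rsp, VARIABLE_OFFSET' has executed in the function
+; body below: 8 tweak values at [TW], the 15 expanded round keys at [keys], the
+; saved xmm6-xmm15 at [_xmm] (win64 only), and the saved GPRs at [_gpr].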
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_avx(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
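+;
+; For reference, a minimal C caller sketch for the prototype above (the 512-byte
+; sector length and buffer names are illustrative assumptions, not requirements;
+; XTS needs at least one full 16-byte block):
+;
+;   #include <stdint.h>
+;
+;   extern void XTS_AES_256_enc_avx(uint8_t *k2, uint8_t *k1, uint8_t *TW_initial,
+;                                   uint64_t N, const uint8_t *pt, uint8_t *ct);
+;
+;   void encrypt_sector(uint8_t k2[32], uint8_t k1[32], uint8_t iv[16],
+;                       const uint8_t pt[512], uint8_t ct[512])
+;   {
+;           XTS_AES_256_enc_avx(k2, k1, iv, 512, pt, ct);
+;   }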
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of the vaeskeygenassist instruction
+; round_key holds the current round key on entry to a key_expansion_256 macro
+; and the next round key on exit
+; the two macros below are used alternately (flip/flop) for key generation
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 10101010b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
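+
+; For reference, the two macros above implement the standard AES-256 key-schedule
+; step; the vshufps immediates are a register-only way of producing the
+; byte-shifted copies that a conventional formulation obtains with byte shifts.
+; A C-intrinsics sketch of the equivalent 'flip' step (function name is
+; illustrative, not part of this library; raw comes from
+; _mm_aeskeygenassist_si128 on the previous round key):
+;
+;   #include <immintrin.h>
+;
+;   static __m128i key_expansion_256_flip_c(__m128i raw, __m128i key)
+;   {
+;           __m128i t;
+;           raw = _mm_shuffle_epi32(raw, 0xff);   /* broadcast SubWord(RotWord())^rcon */
+;           t   = _mm_slli_si128(key, 4);
+;           key = _mm_xor_si128(key, t);
+;           t   = _mm_slli_si128(t, 4);
+;           key = _mm_xor_si128(key, t);
+;           t   = _mm_slli_si128(t, 4);
+;           key = _mm_xor_si128(key, t);
+;           return _mm_xor_si128(key, raw);       /* next round key */
+;   }
+;
+;   /* the 'flop' step is identical except raw is broadcast with 0xaa */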
+
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 10
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%ptr_key2 %8
+%define %%ptr_key1 %9
+%define %%ptr_expanded_keys %10
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1_2
+
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1_2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1_2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
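+
+; Net effect of encrypt_T: [TW] holds T0 = AES-256-Encrypt(key2, TW_initial) and
+; [keys+16*0 .. keys+16*14] holds the 15 expanded round keys of key1; key2's
+; schedule is generated only transiently to encrypt the tweak and is never stored.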
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
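+
+; The shl/adc/cmovc/xor sequence above (and repeated throughout this file)
+; multiplies the 128-bit tweak by x in GF(2^128) using the XTS reduction
+; constant GHASH_POLY (0x87). A plain C sketch of the same update, with
+; T[0] = low qword and T[1] = high qword (helper name is illustrative):
+;
+;   #include <stdint.h>
+;
+;   static void xts_mul_x(uint64_t T[2])
+;   {
+;           uint64_t carry = T[1] >> 63;           /* bit shifted out of the top */
+;           T[1] = (T[1] << 1) | (T[0] >> 63);     /* 128-bit shift left by one  */
+;           T[0] = (T[0] << 1) ^ (carry ? 0x87 : 0);
+;   }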
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
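+
+; Note: when %%lt128 is 0, the next eight tweak values are computed in the gaps
+; between AES rounds, so the scalar tweak arithmetic can overlap with the latency
+; of the vaesenc instructions instead of adding serial time.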
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_256_enc_avx, function
+XTS_AES_256_enc_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ; blend using xmm0 as the mask
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenc xmm8, [keys + 16*10] ; round 10
+ vaesenc xmm8, [keys + 16*11] ; round 11
+ vaesenc xmm8, [keys + 16*12] ; round 12
+ vaesenc xmm8, [keys + 16*13] ; round 13
+ vaesenclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
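+; For readers unfamiliar with XTS ciphertext stealing: the code between
+; _steal_cipher and _done implements the standard construction. A conceptual C
+; sketch (the assembly does the byte shuffles with vpshufb_shf_table/mask1, and
+; xts_encrypt_block is a hypothetical helper doing tweak-xor / AES-256 / tweak-xor
+; with the next tweak value):
+;
+;   #include <stdint.h>
+;   #include <string.h>
+;
+;   void xts_encrypt_block(const uint8_t in[16], uint8_t out[16]); /* hypothetical */
+;
+;   void xts_steal(uint8_t *out_full,      /* 16-byte slot of the last full block  */
+;                  uint8_t *out_tail,      /* tail_len-byte slot after it          */
+;                  const uint8_t cc[16],   /* ciphertext of the last full block    */
+;                  const uint8_t *p_tail, size_t tail_len)
+;   {
+;           uint8_t pp[16];
+;           memcpy(out_tail, cc, tail_len);              /* Cm = head of Cm-1       */
+;           memcpy(pp, p_tail, tail_len);                /* PP = Pm || tail of Cm-1 */
+;           memcpy(pp + tail_len, cc + tail_len, 16 - tail_len);
+;           xts_encrypt_block(pp, out_full);             /* Cm-1 = E(PP)            */
+;   }
+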
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm
new file mode 100644
index 000000000..6db85486d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm
@@ -0,0 +1,1653 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; expanded keys are not assumed to be aligned
+; plaintext and ciphertext are not assumed to be aligned
+; the second key (k1) is copied onto the stack, 16-Byte aligned
+; the first key (k2) is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
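+
+; This variant takes the 15 pre-expanded round keys for both k2 and k1 (16*15
+; bytes each): encrypt_T below simply copies k1's schedule onto the stack while
+; running the tweak encryption with k2's schedule. Producing the expanded keys
+; (e.g. with ISA-L's AES key-expansion routines) is up to the caller and outside
+; the scope of this file.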
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*11]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*11]
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*12]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*12]
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*13]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*13]
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*14]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*14]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
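+
+; Note on encrypt_initial above and encrypt_by_eight below: the scalar generation
+; of the next tweak values (the shl/adc/cmovc/xor and mov-to-[TW] steps) is
+; interleaved with the AES rounds so that it executes in the shadow of the aesenc
+; latency; in encrypt_by_eight the final two tweak qword stores are deferred until
+; after aesenclast for the same reason.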
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_256_enc_expanded_key_avx, function
+XTS_AES_256_enc_expanded_key_avx:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
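+	; XTS ciphertext stealing: N_val mod 16 trailing plaintext bytes remain.
+	; Roughly (illustrative description, r = N_val mod 16): the first r bytes of
+	; the last full ciphertext block become the final partial ciphertext, and that
+	; block's 16-byte slot is rewritten with AES-enc(next tweak, r-byte plaintext
+	; tail || remaining 16-r ciphertext bytes), as implemented below.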
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+	vpblendvb	xmm3, xmm3, xmm2, xmm0	; blend xmm3 and xmm2, xmm0 is the explicit byte selector in the VEX encoding
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+	vaesenc		xmm8, [keys + 16*10]		; round 10
+	vaesenc		xmm8, [keys + 16*11]		; round 11
+	vaesenc		xmm8, [keys + 16*12]		; round 12
+	vaesenc		xmm8, [keys + 16*13]		; round 13
+	vaesenclast	xmm8, [keys + 16*14]		; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values as shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89	; shl 15 (16-1) / shr1
+;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a	; shl 14 (16-2) / shr2
+;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b	; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
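+; the steal-cipher code loads 16 bytes of this table at offset N_val (1..15) to
+; left-shift the last full ciphertext block, and 16 bytes at offset 16-N_val
+; (XORed with mask1 below) to right-shift the partial plaintext; the XORed
+; control register also serves as the byte selector for vpblendvb.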
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm
new file mode 100644
index 000000000..51cb31074
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm
@@ -0,0 +1,1652 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
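+; 0x87 is the low byte of the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1,
+; used when multiplying the tweak by x in GF(2^128)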
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
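+;
+; Illustrative C-side call (a sketch; the variable names are hypothetical, and both
+; key schedules must already be expanded to 15 round keys of 16 bytes each):
+;	uint8_t k2_exp[16*15], k1_exp[16*15], tweak_iv[16];
+;	XTS_AES_256_enc_expanded_key_sse(k2_exp, k1_exp, tweak_iv, sector_len, pt, ct);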
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
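+; (it also copies the 15 round keys of key1 into the 16-byte-aligned [keys] area
+; on the stack, so the bulk-encryption macros can load them with movdqa)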
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*11]
+ aesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*11]
+ movdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*12]
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*12]
+ movdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*13]
+ aesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*13]
+ movdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*14]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*14]
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
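+
+; For reference, the scalar tweak update interleaved ("stitched") between the
+; AES rounds above is the XTS GF(2^128) multiply-by-2, spread across the rounds
+; to hide its latency. A minimal sketch of one update, with the current tweak in
+; twtempl (low 64 bits) and twtemph (high 64 bits) and 0x87 in ghash_poly_8b:
+;
+;	xor	ghash_poly_8b_temp, ghash_poly_8b_temp	; scratch = 0
+;	shl	twtempl, 1				; low half * 2, top bit -> CF
+;	adc	twtemph, twtemph			; high half * 2 + carry in
+;	cmovc	ghash_poly_8b_temp, ghash_poly_8b	; bit 127 fell out -> select 0x87
+;	xor	twtempl, ghash_poly_8b_temp		; reduce mod x^128 + x^7 + x^2 + x + 1
+;
+; i.e. roughly T = (T << 1) ^ (0x87 if the old bit 127 was set, else 0).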
+
+
+section .text
+
+mk_global XTS_AES_256_enc_expanded_key_sse, function
+XTS_AES_256_enc_expanded_key_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because the last 8 blocks will not be stitched with the tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; xmm8 becomes the state register: swapping the pxor destination avoids a move from xmm3 to xmm8
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+ aesenc xmm8, [keys + 16*10] ; round 10
+ aesenc xmm8, [keys + 16*11] ; round 11
+ aesenc xmm8, [keys + 16*12] ; round 12
+ aesenc xmm8, [keys + 16*13] ; round 13
+ aesenclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; shift constants for the pshufb instruction
+; different offsets into the table give the values shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
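+
+; A sketch of how _steal_cipher uses this table: with N_val holding the tail
+; length (1..15), the mask loaded from [pshufb_shf_table + N_val] makes pshufb
+; shift the last full ciphertext block left by 16-N_val bytes (e.g. for
+; N_val = 1 it moves byte 0 to byte 15 and zeroes the rest), while the mask
+; loaded from [pshufb_shf_table + 16 - N_val] and xored with mask1 shifts the
+; final partial plaintext block right by 16-N_val bytes, so the two can be
+; merged with pblendvb before the last AES pass.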
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_vaes.asm
new file mode 100644
index 000000000..37a5dc792
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_vaes.asm
@@ -0,0 +1,1634 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; expanded keys are not required to be aligned
+; the key schedule is copied to the stack in parallel with the tweak encryption
+; plaintext and ciphertext are not required to be aligned
+; the second key (k1) is stored on the stack, aligned to 16 bytes
+; the first key (k2) is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_expanded_key_vaes(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
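+;
+; A minimal usage sketch (C-style, assuming both keys have already been
+; expanded into 15 16-byte round keys each, e.g. with the library's
+; aes_keyexp_256() helper, and that N is at least 16 bytes; a length that is
+; not a multiple of 16 is handled by ciphertext stealing):
+;
+;	uint8_t k1_exp[16*15], k2_exp[16*15], tweak[16];
+;	uint8_t pt[512], ct[512];
+;	/* ... fill keys, tweak and plaintext ... */
+;	XTS_AES_256_enc_expanded_key_vaes(k2_exp, k1_exp, tweak, 512, pt, ct);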
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*11]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*11]
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*12]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*12]
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*13]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*13]
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*14]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*14]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
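+
+; On return from encrypt_T the stack area is laid out as the rest of this file
+; expects (a sketch, offsets relative to rsp):
+;	[TW + 16*0]		the encrypted initial tweak (tweak for block 0);
+;				the TW area later holds up to 8 tweak values
+;	[keys + 16*0..16*14]	the 15 round keys of k1, each 16-byte aligned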
+
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
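+
+; For reference, the in-register tweak update used above multiplies every
+; 128-bit lane of a tweak register by x^8 in GF(2^128), i.e. advances each
+; tweak by 8 block positions. A sketch of one update of a tweak register TW:
+;
+;	vpsrldq		zmm13, TW, 15		; isolate the top byte of each lane
+;	vpclmulqdq	zmm14, zmm13, zpoly, 0	; top byte * 0x87, carry-less
+;	vpslldq		zmm15, TW, 1		; shift each lane left by one byte
+;	vpxord		zmm15, zmm15, zmm14	; fold the reduction back in
+;
+; so a register holding {T*2^i .. T*2^(i+3)} becomes {T*2^(i+8) .. T*2^(i+11)}.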
+
+
+; Encrypt 16 blocks in parallel
+; generate next 16 tweak values
+%macro encrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
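+
+; The 16-block variant follows the same pattern with four state registers; a
+; sketch of its tweak bookkeeping: if TW1..TW4 cover blocks n .. n+15, the next
+; set is built as TW3*2^8 (blocks n+16..n+19), TW4*2^8 (n+20..n+23),
+; TW3*2^16 (n+24..n+27) and TW4*2^16 (n+28..n+31), again interleaved with the
+; AES rounds.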
+
+
+section .text
+
+mk_global XTS_AES_256_enc_expanded_key_vaes, function
+XTS_AES_256_enc_expanded_key_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
+
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+ vmovdqa xmm8, xmm0
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+
+ vextracti32x4 xmm8, zmm2, 0x2
+ vextracti32x4 xmm0, zmm10, 0x3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+
+ vextracti32x4 xmm8, zmm2, 0x1
+ vextracti32x4 xmm0, zmm10, 0x2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+
+ movdqa xmm8, xmm2
+ vextracti32x4 xmm0, zmm10, 0x1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+
+ vextracti32x4 xmm8, zmm1, 0x3
+ vextracti32x4 xmm0, zmm10, 0x0
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_3:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ vmovdqa xmm8, xmm3
+ vextracti32x4 xmm0, zmm9, 3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_2:
+ vextracti32x4 xmm10, zmm9, 1
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+
+ vmovdqa xmm8, xmm2
+ vextracti32x4 xmm0, zmm9, 2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ vmovdqa xmm8, xmm1
+ vextracti32x4 xmm0, zmm9, 1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+
+_start_by16:
+ ; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+ ; Make next 8 tweak values by multiplying all by 2^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
+
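+	; At this point the tweak registers cover the first 16 blocks of the
+	; sector (a sketch): the four 128-bit lanes of zmm9 hold T*2^0 .. T*2^3,
+	; zmm10 holds T*2^4 .. T*2^7, zmm11 holds T*2^8 .. T*2^11 and zmm12
+	; holds T*2^12 .. T*2^15, where T is the encrypted initial tweak at [TW].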
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ vextracti32x4 xmm0, zmm4, 0x3 ; keep the last encrypted block
+ jmp _do_n_blocks
+
+_start_by8:
+ ; Make first 7 tweak values
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ ; load plaintext
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ ; store ciphertext
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+ vextracti32x4 xmm0, zmm2, 0x3 ; keep the last encrypted block
+ jmp _do_n_blocks
+
+_steal_cipher_next:
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+ vmovdqa xmm0, [TW]
+
+_steal_cipher:
+ ; start cipher stealing simplified: xmm8 - last cipher block, xmm0 - next tweak
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenc xmm8, [keys + 16*10] ; round 10
+ vaesenc xmm8, [keys + 16*11] ; round 11
+ vaesenc xmm8, [keys + 16*12] ; round 12
+ vaesenc xmm8, [keys + 16*13] ; round 13
+ vaesenclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
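+
+	; In outline, the stealing path above is standard XTS ciphertext
+	; stealing: the first N_val bytes of the last full ciphertext block are
+	; emitted as the short final ciphertext, its remaining 16-N_val bytes
+	; are appended to the N_val-byte plaintext tail to rebuild a full block,
+	; and that block is encrypted with the next tweak and written back as
+	; the last full ciphertext block.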
+
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; shift constants for the vpshufb instruction
+; different alignments produce the mask values shown below:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
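+; Loading 16 bytes from [vpshufb_shf_table + N_val] gives a byte-shuffle mask
+; that shifts a register left by (16 - N_val) bytes (zero filling); loading
+; from [vpshufb_shf_table + 16 - N_val] and xoring with mask1 gives the
+; matching right-shift mask. The cipher-stealing code above also reuses the
+; high bits of that mask to blend the stolen ciphertext bytes into the tail.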
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+%else ; The assembler does not understand these opcodes; define an empty symbol for Windows builds.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_256_enc_expanded_key_vaes
+no_XTS_AES_256_enc_expanded_key_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm
new file mode 100644
index 000000000..5b805b74d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm
@@ -0,0 +1,1708 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, aligned to 16 bytes
+; the first key is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_sse(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
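+; Overall flow (standard XTS-AES, names illustrative):
+;     T = AES-256-Enc(k2, TW_initial)            ; encrypt_T, stitched with key expansion
+;     for each full 16-byte block P:
+;         C = AES-256-Enc(k1, P xor T) xor T
+;         T = T * x in GF(2^128) mod x^128 + x^7 + x^2 + x + 1
+;     a 1-15 byte tail, if any, is handled by ciphertext stealing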
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of aeskeygenassist instruction
+; the round_key value before this key_expansion_256 macro is the current round key
+; the round_key value after this key_expansion_256 macro is the next round key
+; 2 macros will be used for key generation in a flip-flopped fashion
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 10101010b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
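+
+; In AES-256 key-schedule terms: dword 3 of the aeskeygenassist result holds
+; RotWord(SubWord(w[3])) xor Rcon and dword 2 holds SubWord(w[3]);
+; key_expansion_256_flip broadcasts dword 3 (pshufd 0xff) to derive the
+; even-numbered round keys, key_expansion_256_flop broadcasts dword 2
+; (pshufd 0xaa) for the odd-numbered ones, and the shufps/pxor pairs
+; accumulate the running xor of the previous round key's dwords (FIPS-197).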
+
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 10
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%ptr_key2 %8
+%define %%ptr_key1 %9
+%define %%ptr_expanded_keys %10
+
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ movdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1_2
+
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1_2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1_2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*11], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*12], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*13], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
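+
+; encrypt_T stitches three streams together: the k2 round keys are generated
+; and consumed immediately to advance the tweak encryption, the k1 round keys
+; are generated in lockstep and stored to [keys] for the bulk encryption, and
+; after 14 rounds the encrypted tweak is written to [TW].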
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
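+
+; The scalar shl/adc/cmovc/xor steps interleaved between the aesenc rounds
+; above compute the next 8 tweak values and park them at [TW]; doing the
+; GF(2^128) doublings in the shadow of the AES rounds hides their latency,
+; and the new tweaks are reloaded into the tweak registers only after the
+; final round.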
+
+
+section .text
+
+mk_global XTS_AES_256_enc_sse, function
+XTS_AES_256_enc_sse:
+ endbranch
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
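+ ; peel off (whole blocks mod 8) blocks first so the remaining length is a
+ ; multiple of 128 bytes for _main_loop; the final 8 blocks go through
+ ; _last_eight, which skips generating tweak values that would never be used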
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8: instead of moving xmm3 into xmm8, the destination of the pxor is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+ aesenc xmm8, [keys + 16*10] ; round 10
+ aesenc xmm8, [keys + 16*11] ; round 11
+ aesenc xmm8, [keys + 16*12] ; round 12
+ aesenc xmm8, [keys + 16*13] ; round 13
+ aesenclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; shift constants for the pshufb instruction
+; different alignments produce the mask values shown below:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm
new file mode 100644
index 000000000..f75497ece
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_vaes.asm
@@ -0,0 +1,1687 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, aligned to 16 bytes
+; the first key is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_vaes(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+%define zpoly zmm25
+
+; produce the key for the next round
+; raw_key is the output of vaeskeygenassist instruction
+; the round_key value before this key_expansion_256 macro is the current round key
+; the round_key value after this key_expansion_256 macro is the next round key
+; 2 macros will be used for key generation in a flip-flopped fashion
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 10101010b
+ vshufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ vshufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 10
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%ptr_key2 %8
+%define %%ptr_key1 %9
+%define %%ptr_expanded_keys %10
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1_2
+
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1_2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1_2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
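+
+; Net effect of encrypt_T, in outline (helper names illustrative only):
+;   keys[0..14] <- AES-256 key schedule of k1, stored on the stack
+;   TW[0]       <- AES-256-Encrypt(k2, TW_initial), the tweak for block 0
+; Both key schedules are generated in lock-step so the tweak encryption can
+; proceed round by round while the round keys are still being produced.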
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+%endmacro
+
+
+; encrypt the initial blocks
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; the next 8 tweak values are generated (skipped when %%lt128 is set)
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
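+
+; When %%lt128 is 0, the scalar doubling that produces the next 8 tweak
+; values is interleaved between the AES rounds above so the integer work
+; overlaps the vaesenc latency; when %%lt128 is 1 (final, sub-128-byte pass)
+; those steps are skipped.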
+
+
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight_zmm 6
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%TW1 %3 ; tweak 1
+%define %%TW2 %4 ; tweak 2
+%define %%T0 %5 ; Temp register
+%define %%last_eight %6
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW1, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW1, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW2, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW2, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+%endmacro
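+
+; The in-loop tweak update above multiplies each 128-bit tweak lane by 2^8:
+; vpslldq shifts the lane left by one byte, and the byte that falls off the
+; top (isolated by the vpsrldq ..., 15) is carry-less multiplied by 0x87 and
+; xored back in. A rough per-lane C sketch (illustrative only, not part of
+; this file's build):
+;
+;   static void xts_mul_x8(uint64_t t[2])   /* t[0] = low, t[1] = high */
+;   {
+;       uint64_t top = t[1] >> 56;          /* byte shifted out of bit 127 */
+;       t[1] = (t[1] << 8) | (t[0] >> 56);
+;       t[0] <<= 8;
+;       for (int i = 0; i < 8; i++)         /* clmul(top, 0x87), folded in */
+;           if (top & (1ULL << i))
+;               t[0] ^= (uint64_t)0x87 << i;
+;   }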
+
+
+; Encrypt 16 blocks in parallel
+; generate next 16 tweak values
+%macro encrypt_by_16_zmm 10
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+
+%define %%TW1 %5 ; tweak 1
+%define %%TW2 %6 ; tweak 2
+%define %%TW3 %7 ; tweak 3
+%define %%TW4 %8 ; tweak 4
+
+%define %%T0 %9 ; Temp register
+%define %%last_eight %10
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; ARK
+ vbroadcasti32x4 %%T0, [keys]
+ vpxorq %%ST1, %%T0
+ vpxorq %%ST2, %%T0
+ vpxorq %%ST3, %%T0
+ vpxorq %%ST4, %%T0
+
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW3, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm15, %%TW3, 1
+ vpxord zmm15, zmm15, zmm14
+%endif
+ ; round 1
+ vbroadcasti32x4 %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 2
+ vbroadcasti32x4 %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 3
+ vbroadcasti32x4 %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, %%TW4, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm16, %%TW4, 1
+ vpxord zmm16, zmm16, zmm14
+%endif
+ ; round 4
+ vbroadcasti32x4 %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 5
+ vbroadcasti32x4 %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 6
+ vbroadcasti32x4 %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm15, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm17, zmm15, 1
+ vpxord zmm17, zmm17, zmm14
+%endif
+ ; round 7
+ vbroadcasti32x4 %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 8
+ vbroadcasti32x4 %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 9
+ vbroadcasti32x4 %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+%if (0 == %%last_eight)
+ vpsrldq zmm13, zmm16, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm18, zmm16, 1
+ vpxord zmm18, zmm18, zmm14
+%endif
+ ; round 10
+ vbroadcasti32x4 %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 11
+ vbroadcasti32x4 %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 12
+ vbroadcasti32x4 %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 13
+ vbroadcasti32x4 %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+
+ ; round 14
+ vbroadcasti32x4 %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+
+ ; xor Tweak values
+ vpxorq %%ST1, %%TW1
+ vpxorq %%ST2, %%TW2
+ vpxorq %%ST3, %%TW3
+ vpxorq %%ST4, %%TW4
+
+ ; load next Tweak values
+ vmovdqa32 %%TW1, zmm15
+ vmovdqa32 %%TW2, zmm16
+ vmovdqa32 %%TW3, zmm17
+ vmovdqa32 %%TW4, zmm18
+%endmacro
+
+
+section .text
+
+mk_global XTS_AES_256_enc_vaes, function
+XTS_AES_256_enc_vaes:
+ endbranch
+
+%define ALIGN_STACK
+%ifdef ALIGN_STACK
+ push rbp
+ mov rbp, rsp
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+%else
+ sub rsp, VARIABLE_OFFSET
+%endif
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+ cmp N_val, 128
+ jl _less_than_128_bytes
+
+ vpbroadcastq zpoly, ghash_poly_8b
+
+ cmp N_val, 256
+ jge _start_by16
+
+ cmp N_val, 128
+ jge _start_by8
+
+_do_n_blocks:
+ cmp N_val, 0
+ je _ret_
+
+ cmp N_val, (7*16)
+ jge _remaining_num_blocks_is_7
+
+ cmp N_val, (6*16)
+ jge _remaining_num_blocks_is_6
+
+ cmp N_val, (5*16)
+ jge _remaining_num_blocks_is_5
+
+ cmp N_val, (4*16)
+ jge _remaining_num_blocks_is_4
+
+ cmp N_val, (3*16)
+ jge _remaining_num_blocks_is_3
+
+ cmp N_val, (2*16)
+ jge _remaining_num_blocks_is_2
+
+ cmp N_val, (1*16)
+ jge _remaining_num_blocks_is_1
+
+;; _remaining_num_blocks_is_0:
+ vmovdqa xmm8, xmm0
+ vmovdqa xmm0, xmm9
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_7:
+ mov tmp1, -1
+ shr tmp1, 16
+ kmovq k1, tmp1
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2 {k1}, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*7
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4] {k1}, zmm2
+ add ptr_ciphertext, 16*7
+
+ vextracti32x4 xmm8, zmm2, 0x2
+ vextracti32x4 xmm0, zmm10, 0x3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_6:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 ymm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*6
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], ymm2
+ add ptr_ciphertext, 16*6
+
+ vextracti32x4 xmm8, zmm2, 0x1
+ vextracti32x4 xmm0, zmm10, 0x2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_5:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 16*5
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu [ptr_ciphertext+16*4], xmm2
+ add ptr_ciphertext, 16*5
+
+ movdqa xmm8, xmm2
+ vextracti32x4 xmm0, zmm10, 0x1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_4:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ add ptr_plaintext, 16*4
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 1
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ add ptr_ciphertext, 16*4
+
+ vextracti32x4 xmm8, zmm1, 0x3
+ vextracti32x4 xmm0, zmm10, 0x0
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_3:
+ vextracti32x4 xmm10, zmm9, 1
+ vextracti32x4 xmm11, zmm9, 2
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ vmovdqa xmm8, xmm3
+ vextracti32x4 xmm0, zmm9, 3
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_2:
+ vextracti32x4 xmm10, zmm9, 1
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ add ptr_ciphertext, 16*2
+
+ vmovdqa xmm8, xmm2
+ vextracti32x4 xmm0, zmm9, 2
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+_remaining_num_blocks_is_1:
+ vmovdqu xmm1, [ptr_plaintext]
+ add ptr_plaintext, 16
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ vmovdqa xmm8, xmm1
+ vextracti32x4 xmm0, zmm9, 1
+ and N_val, 15
+ je _ret_
+ jmp _steal_cipher
+
+
+_start_by16:
+	; Make the first 8 tweak values, T*2^0 .. T*2^7
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+	; Make the next 8 tweak values by multiplying all of them by 2^8
+ vpsrldq zmm13, zmm9, 15
+ vpclmulqdq zmm14, zmm13, zpoly, 0
+ vpslldq zmm11, zmm9, 1
+ vpxord zmm11, zmm11, zmm14
+
+ vpsrldq zmm15, zmm10, 15
+ vpclmulqdq zmm16, zmm15, zpoly, 0
+ vpslldq zmm12, zmm10, 1
+ vpxord zmm12, zmm12, zmm16
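+
+	; At this point zmm9:zmm10 hold the tweaks T*2^0 .. T*2^7 and
+	; zmm11:zmm12 hold T*2^8 .. T*2^15 (reduced mod x^128+x^7+x^2+x+1).
+	; Roughly: vpsllvq shifts each qword left by the per-lane count, the
+	; shufb_15_7 copy supplies the bits that cross the 64-bit boundary via
+	; vpsrlvq, the k2 (0xaa) mask xors those carries into the high qwords
+	; only, and the bits pushed out of bit 127 are folded back with one
+	; vpclmulqdq by 0x87 per group, e.g. lane i of zmm9 is
+	; (T << i) ^ clmul(T >> (128-i), 0x87).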
+
+_main_loop_run_16:
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ vmovdqu8 zmm3, [ptr_plaintext+16*8]
+ vmovdqu8 zmm4, [ptr_plaintext+16*12]
+ add ptr_plaintext, 256
+
+ encrypt_by_16_zmm zmm1, zmm2, zmm3, zmm4, zmm9, zmm10, zmm11, zmm12, zmm0, 0
+
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ vmovdqu8 [ptr_ciphertext+16*8], zmm3
+ vmovdqu8 [ptr_ciphertext+16*12], zmm4
+ add ptr_ciphertext, 256
+ sub N_val, 256
+
+ cmp N_val, 256
+ jge _main_loop_run_16
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+	vextracti32x4	xmm0, zmm4, 0x3		; keep last encrypted block
+ jmp _do_n_blocks
+
+_start_by8:
+	; Make the first 8 tweak values, T*2^0 .. T*2^7
+ vbroadcasti32x4 zmm0, [TW]
+ vbroadcasti32x4 zmm8, [shufb_15_7]
+ mov tmp1, 0xaa
+ kmovq k2, tmp1
+
+ ; Mult tweak by 2^{3, 2, 1, 0}
+ vpshufb zmm1, zmm0, zmm8 ; mov 15->0, 7->8
+ vpsllvq zmm4, zmm0, [const_dq3210] ; shift l 3,2,1,0
+ vpsrlvq zmm2, zmm1, [const_dq5678] ; shift r 5,6,7,8
+ vpclmulqdq zmm3, zmm2, zpoly, 0x00
+ vpxorq zmm4 {k2}, zmm4, zmm2 ; tweaks shifted by 3-0
+ vpxord zmm9, zmm3, zmm4
+
+ ; Mult tweak by 2^{7, 6, 5, 4}
+ vpsllvq zmm5, zmm0, [const_dq7654] ; shift l 7,6,5,4
+ vpsrlvq zmm6, zmm1, [const_dq1234] ; shift r 1,2,3,4
+ vpclmulqdq zmm7, zmm6, zpoly, 0x00
+ vpxorq zmm5 {k2}, zmm5, zmm6 ; tweaks shifted by 7-4
+ vpxord zmm10, zmm7, zmm5
+
+_main_loop_run_8:
+ ; load plaintext
+ vmovdqu8 zmm1, [ptr_plaintext+16*0]
+ vmovdqu8 zmm2, [ptr_plaintext+16*4]
+ add ptr_plaintext, 128
+
+ encrypt_by_eight_zmm zmm1, zmm2, zmm9, zmm10, zmm0, 0
+
+ ; store ciphertext
+ vmovdqu8 [ptr_ciphertext+16*0], zmm1
+ vmovdqu8 [ptr_ciphertext+16*4], zmm2
+ add ptr_ciphertext, 128
+ sub N_val, 128
+
+ cmp N_val, 128
+ jge _main_loop_run_8
+
+	vextracti32x4	xmm0, zmm2, 0x3		; keep last encrypted block
+ jmp _do_n_blocks
+
+_steal_cipher_next:
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+ vmovdqa xmm0, [TW]
+
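+; Cipher-stealing summary, in outline (n = N_val, 1..15):
+;   final n ciphertext bytes       = first n bytes of the last full ciphertext block CC
+;   block rewritten at CC's offset = AES-256-Enc(K1, (P_tail || CC[n..15]) ^ T) ^ T
+; where P_tail is the remaining n plaintext bytes and T is the next tweak value.
+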
+_steal_cipher:
+	; simplified cipher stealing: xmm8 = last cipher block, xmm0 = next tweak value
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm10, [twtempl+N_val]
+ vpshufb xmm8, xmm10
+
+ vmovdqu xmm3, [ptr_plaintext - 16 + N_val]
+ vmovdqu [ptr_ciphertext - 16 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm10, [twtempl]
+ vpxor xmm10, [mask1]
+ vpshufb xmm3, xmm10
+
+ vpblendvb xmm3, xmm3, xmm2, xmm10
+
+ ; xor Tweak value
+ vpxor xmm8, xmm3, xmm0
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenc xmm8, [keys + 16*10] ; round 10
+ vaesenc xmm8, [keys + 16*11] ; round 11
+ vaesenc xmm8, [keys + 16*12] ; round 12
+ vaesenc xmm8, [keys + 16*13] ; round 13
+ vaesenclast xmm8, [keys + 16*14] ; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, xmm8, xmm0
+
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext - 16], xmm8
+
+_ret_:
+ mov rbx, [_gpr + 8*0]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+%ifndef ALIGN_STACK
+ add rsp, VARIABLE_OFFSET
+%else
+ mov rsp, rbp
+ pop rbp
+%endif
+ ret
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
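+	; fall through: 7 whole blocks remain.  tmp1 = N_val & 0x70 is the
+	; number of whole 16-byte blocks (1..7, since N_val < 128 here); the
+	; remaining N_val mod 16 bytes are handled afterwards by cipher stealing.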
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ add ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ add ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ add ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ add ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+
+ add ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm1
+ and N_val, 15 ; N_val = N_val mod 16
+ je _ret_
+ jmp _steal_cipher_next
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; shift constants for the vpshufb instruction
+; different alignments result in the values shown below:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+const_dq3210: dq 0, 0, 1, 1, 2, 2, 3, 3
+const_dq5678: dq 8, 8, 7, 7, 6, 6, 5, 5
+const_dq7654: dq 4, 4, 5, 5, 6, 6, 7, 7
+const_dq1234: dq 4, 4, 3, 3, 2, 2, 1, 1
+
+shufb_15_7: db 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+%else ; Assembler doesn't understand these opcodes. Add an empty symbol for Windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_XTS_AES_256_enc_vaes
+no_XTS_AES_256_enc_vaes:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_128.S
new file mode 100644
index 000000000..7214f0f25
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_128.S
@@ -0,0 +1,215 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "gcm_common_128.S"
+/*
+ void gist_aes_gcm_enc_finalize_##mode( \
+ const struct gcm_key_data *key_data, \
+ struct gcm_context_data *context, \
+ uint8_t *auth_tag, \
+ uint64_t auth_tag_len \
+ )
+*/
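+/*
+ * Outline: fold the buffered partial block (if any) and the length block
+ * (AAD and message lengths, in bits) into the running GHASH while encrypting
+ * the saved ORIG_IV (J0) with the AES-128 round keys (Key0 .. Key10), then
+ * compute Tag = AES-Enc(K, J0) ^ GHASH and store its first 16, 12 or 8 bytes
+ * according to auth_tag_len.
+ */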
+ declare_var_generic_reg key_data ,0
+ declare_var_generic_reg context ,1
+ declare_var_generic_reg auth_tag ,2
+ declare_var_generic_reg auth_tag_len ,3
+ declare_var_generic_reg partial_block_len ,4
+ declare_var_generic_reg partial_block ,1
+
+ declare_var_generic_reg hashkey_addr ,0
+ declare_var_generic_reg temp0, 6
+
+ declare_var_vector_reg OrigIV ,0
+ declare_var_vector_reg AadHash ,1
+ declare_var_vector_reg HashKey0 ,2
+ declare_var_vector_reg HashKey0Ext ,3
+ declare_var_vector_reg High ,4
+ declare_var_vector_reg Low ,5
+ declare_var_vector_reg Middle0 ,6
+ declare_var_vector_reg Len ,7
+ declare_var_vector_reg Tmp0 ,8
+ declare_var_vector_reg Tmp1 ,9
+ declare_var_vector_reg Zero ,10
+ declare_var_vector_reg Poly ,11
+ declare_var_vector_reg PartitialBlock ,13
+
+ declare_var_vector_reg Tmp2 ,31
+ declare_var_vector_reg Tmp3 ,12
+
+ .set stack_size,48
+ .macro push_stack
+ stp d8, d9,[sp,-stack_size]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ .endm
+
+ .macro pop_stack
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d8, d9, [sp], stack_size
+ .endm
+START_FUNC(enc,KEY_LEN,_finalize_)
+START_FUNC(dec,KEY_LEN,_finalize_)
+ ldr partial_block_len,[context,PARTIAL_BLOCK_LENGTH_OFF]
+ load_aes_keys key_data
+ push_stack
+ /* Init Consts for ghash */
+ movi vZero.4s,0
+ mov temp0,0x87
+ dup vPoly.2d,temp0
+ ldr qOrigIV,[context,ORIG_IV_OFF] /* OrigIV */
+	ldp qAadHash,qLen,[context],PARTIAL_BLOCK_ENC_KEY_OFF /* load AadHash and Len; context advances to the partial block */
+ /* Init Consts for ghash */
+ movi vZero.4s,0
+ mov temp0,0x87
+ dup vPoly.2d,temp0
+ /* complete part */
+ cbnz partial_block_len,10f
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32]
+ aes_encrypt_round OrigIV,Key0
+ pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key1
+ pmull vLow.1q ,vAadHash.1d,vHashKey0.1d
+ shl vLen.2d,vLen.2d,3 /* Len */
+ aes_encrypt_round OrigIV,Key2
+ pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d
+ rev64 vLen.16b,vLen.16b /* Len */
+ aes_encrypt_round OrigIV,Key3
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ rbit vAadHash.16b,vLen.16b /* Len */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32]
+ aes_encrypt_round OrigIV,Key4
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ aes_encrypt_round OrigIV,Key5
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key6
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d
+ aes_encrypt_round OrigIV,Key7
+ eor vHigh.16b,vHigh.16b,vTmp0.16b
+ eor vLow.16b ,vLow.16b ,vTmp1.16b
+ pmull2 vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d
+ aes_encrypt_round OrigIV,Key8
+ pmull vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d
+ aese vOrigIV.16b,vKey9.16b
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b
+ eor vOrigIV.16b,vOrigIV.16b,vKey10.16b
+ rbit vAadHash.16b,vOrigIV.16b
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+ rbit vAadHash.16b,vAadHash.16b /* Aad */
+ /* output auth_tag */
+ cmp auth_tag_len,16
+ bne 1f
+ /* most likely auth_tag_len=16 */
+ str qAadHash,[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=12 */
+ cmp auth_tag_len,12
+ bne 1f
+ str dAadHash,[auth_tag],8
+ st1 {vAadHash.s}[2],[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=8 */
+ str dAadHash,[auth_tag]
+ pop_stack
+ ret
+
+10: /* cbnz partial_block_len,10f */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-3)*32]
+ aes_encrypt_round OrigIV,Key0
+ read_small_data_start PartitialBlock,partial_block,partial_block_len,temp0,Tmp0
+ pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key1
+ pmull vLow.1q ,vAadHash.1d,vHashKey0.1d
+ aes_encrypt_round OrigIV,Key2
+ pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d
+ aes_encrypt_round OrigIV,Key3
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ aes_encrypt_round OrigIV,Key4
+ rbit vAadHash.16b,vPartitialBlock.16b
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32]
+ aes_encrypt_round OrigIV,Key5
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ pmull2 vTmp0.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key6
+ shl vLen.2d,vLen.2d,3 /* Len */
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d
+ eor vHigh.16b,vHigh.16b,vTmp0.16b
+ aes_encrypt_round OrigIV,Key7
+ eor vLow.16b,vLow.16b,vTmp1.16b
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ rev64 vLen.16b,vLen.16b /* Len */
+ aes_encrypt_round OrigIV,Key8
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ aese vOrigIV.16b,vKey9.16b
+ pmull vTmp0.1q,vAadHash.1d,vHashKey0Ext.1d
+ rbit vAadHash.16b,vLen.16b /* Len */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32]
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ eor vOrigIV.16b,vOrigIV.16b,vKey10.16b
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d
+ eor vHigh.16b,vHigh.16b,vTmp0.16b
+ eor vLow.16b ,vLow.16b ,vTmp1.16b
+ pmull2 vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d
+ pmull vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b
+ rbit vAadHash.16b,vOrigIV.16b
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+ rbit vAadHash.16b,vAadHash.16b /* Aad */
+ /* output auth_tag */
+ cmp auth_tag_len,16
+ bne 1f
+ /* most likely auth_tag_len=16 */
+ str qAadHash,[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=12 */
+ cmp auth_tag_len,12
+ bne 1f
+ str dAadHash,[auth_tag],8
+ st1 {vAadHash.s}[2],[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=8 */
+ str dAadHash,[auth_tag]
+ pop_stack
+ ret
+
+END_FUNC(enc,KEY_LEN,_finalize_)
+END_FUNC(dec,KEY_LEN,_finalize_)
+
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_256.S
new file mode 100644
index 000000000..9eda7178e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_finalize_256.S
@@ -0,0 +1,220 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "gcm_common_256.S"
+/*
+ void gist_aes_gcm_enc_finalize_##mode( \
+ const struct gcm_key_data *key_data, \
+ struct gcm_context_data *context, \
+ uint8_t *auth_tag, \
+ uint64_t auth_tag_len \
+ )
+*/
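+/*
+ * Same flow as the 128-bit finalize: fold the partial block (if any) and the
+ * bit-length block into GHASH while encrypting ORIG_IV (J0), here with the
+ * AES-256 key schedule (Key0 .. Key13 plus the final xor with Key14), then
+ * Tag = AES-Enc(K, J0) ^ GHASH, truncated to auth_tag_len (16, 12 or 8).
+ */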
+ declare_var_generic_reg key_data ,0
+ declare_var_generic_reg context ,1
+ declare_var_generic_reg auth_tag ,2
+ declare_var_generic_reg auth_tag_len ,3
+ declare_var_generic_reg partial_block_len ,4
+ declare_var_generic_reg partial_block ,1
+
+ declare_var_generic_reg hashkey_addr ,0
+ declare_var_generic_reg temp0 ,6
+
+ declare_var_vector_reg OrigIV ,0
+ declare_var_vector_reg AadHash ,1
+ declare_var_vector_reg HashKey0 ,2
+ declare_var_vector_reg HashKey0Ext ,3
+ declare_var_vector_reg High ,4
+ declare_var_vector_reg Low ,5
+ declare_var_vector_reg Middle0 ,6
+ declare_var_vector_reg Len ,7
+ declare_var_vector_reg Tmp0 ,8
+ declare_var_vector_reg Tmp1 ,9
+ declare_var_vector_reg Zero ,10
+ declare_var_vector_reg Poly ,11
+ declare_var_vector_reg PartitialBlock ,13
+
+ declare_var_vector_reg Tmp2 ,31
+ declare_var_vector_reg Tmp3 ,12
+
+ .set stack_size,48
+ .macro push_stack
+ stp d8, d9,[sp,-stack_size]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ .endm
+ .macro pop_stack
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d8, d9, [sp], stack_size
+ .endm
+
+START_FUNC(enc,KEY_LEN,_finalize_)
+START_FUNC(dec,KEY_LEN,_finalize_)
+ ldr partial_block_len,[context,PARTIAL_BLOCK_LENGTH_OFF]
+ load_aes_keys key_data
+ push_stack
+
+ ldr qOrigIV,[context,ORIG_IV_OFF] /* OrigIV */
+	ldp qAadHash,qLen,[context],PARTIAL_BLOCK_ENC_KEY_OFF /* load AadHash and Len; context advances to the partial block */
+ /* Init Consts for ghash */
+ movi vZero.4s,0
+ mov temp0,0x87
+ dup vPoly.2d,temp0
+ /* complete part */
+ cbnz partial_block_len,10f
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32]
+ aes_encrypt_round OrigIV,Key0
+ pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key1
+ pmull vLow.1q ,vAadHash.1d,vHashKey0.1d
+ shl vLen.2d,vLen.2d,3 /* Len */
+ aes_encrypt_round OrigIV,Key2
+ pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d
+ rev64 vLen.16b,vLen.16b /* Len */
+ aes_encrypt_round OrigIV,Key3
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ rbit vAadHash.16b,vLen.16b /* Len */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32]
+ aes_encrypt_round OrigIV,Key4
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ aes_encrypt_round OrigIV,Key5
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key6
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d
+ aes_encrypt_round OrigIV,Key7
+ eor vHigh.16b,vHigh.16b,vTmp0.16b
+ eor vLow.16b ,vLow.16b ,vTmp1.16b
+ pmull2 vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d
+ aes_encrypt_round OrigIV,Key8
+ pmull vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d
+ aes_encrypt_round OrigIV,Key9
+ aes_encrypt_round OrigIV,Key10
+ aes_encrypt_round OrigIV,Key11
+ aes_encrypt_round OrigIV,Key12
+ aese vOrigIV.16b,vKey13.16b
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b
+ eor vOrigIV.16b,vOrigIV.16b,vKey14.16b
+ rbit vAadHash.16b,vOrigIV.16b
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+ rbit vAadHash.16b,vAadHash.16b /* Aad */
+ /* output auth_tag */
+ cmp auth_tag_len,16
+ bne 1f
+ /* most likely auth_tag_len=16 */
+ str qAadHash,[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=12 */
+ cmp auth_tag_len,12
+ bne 1f
+ str dAadHash,[auth_tag],8
+ st1 {vAadHash.s}[2],[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=8 */
+ str dAadHash,[auth_tag]
+ pop_stack
+ ret
+
+10: /* cbnz partial_block_len,10f */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-3)*32]
+ aes_encrypt_round OrigIV,Key0
+ read_small_data_start PartitialBlock,partial_block,partial_block_len,temp0,Tmp0
+ pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key1
+ pmull vLow.1q ,vAadHash.1d,vHashKey0.1d
+ aes_encrypt_round OrigIV,Key2
+ pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d
+ aes_encrypt_round OrigIV,Key3
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ aes_encrypt_round OrigIV,Key4
+ rbit vAadHash.16b,vPartitialBlock.16b
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-2)*32]
+ aes_encrypt_round OrigIV,Key5
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ pmull2 vTmp0.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key6
+ shl vLen.2d,vLen.2d,3 /* Len */
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d
+ eor vHigh.16b,vHigh.16b,vTmp0.16b
+ aes_encrypt_round OrigIV,Key7
+ eor vLow.16b,vLow.16b,vTmp1.16b
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ rev64 vLen.16b,vLen.16b /* Len */
+ aes_encrypt_round OrigIV,Key8
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ pmull vTmp0.1q,vAadHash.1d,vHashKey0Ext.1d
+ aes_encrypt_round OrigIV,Key9
+ rbit vAadHash.16b,vLen.16b /* Len */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr,(HASHKEY_TOTAL_NUM-1)*32]
+ aes_encrypt_round OrigIV,Key10
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ aes_encrypt_round OrigIV,Key11
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round OrigIV,Key12
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d
+ aese vOrigIV.16b,vKey13.16b
+ eor vHigh.16b,vHigh.16b,vTmp0.16b
+ eor vLow.16b ,vLow.16b ,vTmp1.16b
+ pmull2 vTmp2.1q ,vAadHash.2d,vHashKey0Ext.2d
+ pmull vTmp3.1q ,vAadHash.1d,vHashKey0Ext.1d
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b
+ eor vOrigIV.16b,vOrigIV.16b,vKey14.16b
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b
+ rbit vAadHash.16b,vOrigIV.16b
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+ rbit vAadHash.16b,vAadHash.16b /* Aad */
+ /* output auth_tag */
+ cmp auth_tag_len,16
+ bne 1f
+ /* most likely auth_tag_len=16 */
+ str qAadHash,[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=12 */
+ cmp auth_tag_len,12
+ bne 1f
+ str dAadHash,[auth_tag],8
+ st1 {vAadHash.s}[2],[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=8 */
+ str dAadHash,[auth_tag]
+ pop_stack
+ ret
+
+END_FUNC(enc,KEY_LEN,_finalize_)
+END_FUNC(dec,KEY_LEN,_finalize_)
+
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S
new file mode 100644
index 000000000..0dd94c6b7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_aes_init.S
@@ -0,0 +1,161 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "gcm_common.S"
+/*
+void gist_aes_gcm_init_##mode(
+ const struct gcm_key_data *key_data,
+ struct gcm_context_data *context,
+ uint8_t *iv,
+ uint8_t const *aad,
+ uint64_t aad_len
+ );
+*/
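For orientation, the prototype above corresponds to the aes_gcm_init_128/192/256 entry points selected by the aarch64 dispatcher later in this series. Below is a minimal C sketch of the multi-call GCM-128 encrypt flow this init step belongs to; the header name, the 12-byte IV convention and the pre/update/finalize signatures are taken from the ISA-L public API as assumptions, not from this patch:

    #include <stdint.h>
    #include "aes_gcm.h"            /* ISA-L crypto GCM interface (assumed header name) */

    /* Sketch only: precompute -> init -> update -> finalize for GCM-128. */
    static void gcm128_encrypt_sketch(const uint8_t key[16], uint8_t *iv /* 12 bytes */,
                                      const uint8_t *aad, uint64_t aad_len,
                                      const uint8_t *pt, uint64_t pt_len,
                                      uint8_t *ct, uint8_t tag[16])
    {
            struct gcm_key_data key_data;
            struct gcm_context_data ctx;

            aes_gcm_pre_128(key, &key_data);                     /* expand key, precompute hash keys */
            aes_gcm_init_128(&key_data, &ctx, iv, aad, aad_len); /* the init step implemented in this file */
            aes_gcm_enc_128_update(&key_data, &ctx, ct, pt, pt_len);
            aes_gcm_enc_128_finalize(&key_data, &ctx, tag, 16);  /* 16-, 12- or 8-byte tags, per the finalize code */
    }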
+ key_data .req x0
+ context .req x1
+ iv .req x2
+ aad .req x3
+ aad_len .req x4
+ temp0 .req x7
+ wtemp0 .req w7
+ temp1 .req x6
+ left_len .req x5
+ aad_left .req x2
+ small_tbl_adr .req x6
+
+ hashkey_base .req x0
+ hashkey_addr .req x2
+
+ declare_var_vector_reg AadHash,0
+ declare_var_vector_reg Dat0,1
+ declare_var_vector_reg HashKey0,2
+ declare_var_vector_reg HashKey0Ext,3
+ declare_var_vector_reg High,4
+ declare_var_vector_reg Middle0,5
+ declare_var_vector_reg Low,6
+ declare_var_vector_reg LeftDat,7
+ declare_var_vector_reg Zero,16
+ declare_var_vector_reg Poly,17
+
+ declare_var_vector_reg Tmp0,18
+ declare_var_vector_reg Tmp1,19
+ declare_var_vector_reg Ctr,1
+
+
+START_FUNC(init,128,_)
+START_FUNC(init,192,_)
+START_FUNC(init,256,_)
+ stp aad_len,xzr,[context,AAD_LEN_OFF] //save in_length and aad_length
+ str xzr,[context,PARTIAL_BLOCK_LENGTH_OFF] //clear partial_block_length
+ add hashkey_base,key_data,HASHKEY_BASE_OFF
+ /* Init Consts for ghash */
+ movi vZero.4s,0
+ mov temp0,0x87
+ dup vPoly.2d,temp0
+ /* Set orig_IV */
+ ldr wtemp0,[iv,8]
+ ldr temp1,[iv]
+ movk temp0,0x100,lsl 48
+ stp temp1,temp0,[context,ORIG_IV_OFF]
+ and left_len,aad_len,15
+ ldp qHashKey0,qHashKey0Ext,[key_data,(HASHKEY_TOTAL_NUM-1)*32]
+ /* Set current_counter, save as cpu order */
+ ldr qCtr,[context,ORIG_IV_OFF]
+ rev32 vCtr.16b,vCtr.16b
+ str qCtr,[context,CTR_OFF]
+ cbz aad_len,init_zero_exit
+ lsr aad_len,aad_len,4
+ /* Read small data */
+ cbz left_len,2f
+ add aad_left,aad,aad_len,lsl 4
+ read_small_data_start LeftDat,aad_left,left_len,small_tbl_adr,Tmp0
+ cbz aad_len,24f // aad_len less than 16
+2:
+ cbnz left_len,1f
+	/* left_len == 0 && aad_len != 0 */
+
+ sub aad_len,aad_len,1
+ /* leftDat = aad[-1] */
+ ldr qLeftDat,[aad,aad_len,lsl 4]
+ cbz aad_len,24f /* aad_len == 16 */
+1:
+ /* aad_len > 16 */
+ ldr qAadHash,[aad],16
+ rbit vAadHash.16b,vAadHash.16b
+ sub aad_len,aad_len,1
+1:
+ /* loop ghash_block */
+ cmp aad_len,HASHKEY_TOTAL_NUM - 1
+ bls 1f /* break loop */
+ sub aad_len,aad_len,HASHKEY_TOTAL_NUM
+ ghash_block_n HASHKEY_TOTAL_NUM,AadHash,Dat0,aad,hashkey_addr,hashkey_base, \
+ HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \
+ Tmp0,Tmp1
+ b 1b /* back to loop start */
+1:
+ cbz aad_len,23f /* left aad_len == 0 */
+ mov temp0,HASHKEY_TOTAL_NUM - 1
+ sub temp0,temp0,aad_len
+ add hashkey_addr,hashkey_base,temp0,lsl 5
+ sub aad_len,aad_len,1
+
+
+ ghash_mult_init_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Middle0,Tmp0,Dat0,2 /* load next hash */
+1:
+ cbz aad_len,1f
+ ghash_mult_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Middle0,Tmp0,Tmp1,Dat0, 2
+
+ sub aad_len,aad_len,1
+ b 1b
+1:
+ ghash_mult_round_noload AadHash,HashKey0,HashKey0Ext,High,Low,Middle0,Tmp0,Tmp1
+ rbit vAadHash.16b, vLeftDat.16b
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+ str qAadHash,[context]
+ ret
+
+23:
+ ghash_block_reg AadHash,LeftDat, \
+ HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \
+ Tmp0
+ str qAadHash,[context]
+ ret
+24:	/* aad_len less than or equal to 16 bytes */
+ rbit vLeftDat.16b, vLeftDat.16b
+ str qLeftDat,[context]
+ ret
+init_zero_exit:
+ stp xzr,xzr,[context]
+ ret
+END_FUNC(init,128,_)
+END_FUNC(init,192,_)
+END_FUNC(init,256,_)
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_consts.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_consts.S
new file mode 100644
index 000000000..c4e8ef59c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_consts.S
@@ -0,0 +1,140 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a
+ .section .rodata
+#define CONST_VAR_START(a) \
+ .align 3;.global a;.type a, %object;a
+
+#define CONST_VAR_END(a) \
+ .size a,. - a
+CONST_VAR_START(shift_small_data_table):
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+CONST_VAR_START(read_small_data_table):
+ .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+CONST_VAR_END(shift_small_data_table)
+ .byte 0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x08,0x09,0x0a,0x0b,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0e,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff
+CONST_VAR_START(write_small_data_table):
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff
+CONST_VAR_END(read_small_data_table)
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0xff,0xff,0x04,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0x08,0x09,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xff,0xff,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0xff,0xff,0x0c,0xff
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff
+CONST_VAR_START(read_end_small_data_table):
+ .byte 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff
+CONST_VAR_END(write_small_data_table)
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0e
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0e
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d
+ .byte 0xff,0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x0c,0x0d,0x0e
+ .byte 0xff,0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b
+ .byte 0xff,0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0e
+ .byte 0xff,0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d
+CONST_VAR_START(write_end_small_data_table):
+ .byte 0xff,0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e
+CONST_VAR_END(read_end_small_data_table)
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0f,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e,0x0f,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0x0f,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff
+ .byte 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0x0f,0xff
+ .byte 0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0xff,0xff,0xff,0xff,0x0e,0x0f,0xff,0xff
+ .byte 0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f,0xff
+ .byte 0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff,0xff,0xff
+ .byte 0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0xff,0xff,0x0f,0xff
+ .byte 0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff,0xff
+CONST_VAR_START(tbx_end_small_data_table):
+ .byte 0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff
+CONST_VAR_END(write_end_small_data_table)
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+CONST_VAR_START(tbx_start_small_data_table):
+ .byte 0xff,0xff,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+CONST_VAR_END(tbx_end_small_data_table)
+ .byte 0xff,0xff,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0b,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0c,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0d,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0e,0x0f
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x0f
+CONST_VAR_END(tbx_start_small_data_table)
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S
new file mode 100644
index 000000000..9f1ff80fb
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_128.S
@@ -0,0 +1,30 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "gcm_common_128.S"
+#include "gcm_enc_dec.S"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S
new file mode 100644
index 000000000..f3cc2b802
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_enc_dec_256.S
@@ -0,0 +1,30 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "gcm_common_256.S"
+#include "gcm_enc_dec.S"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S
new file mode 100644
index 000000000..e635d7e70
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_128.S
@@ -0,0 +1,30 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "gcm_common_128.S"
+#include "gcm_precomp.S" \ No newline at end of file
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S
new file mode 100644
index 000000000..52b76a6a2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_precomp_256.S
@@ -0,0 +1,30 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "gcm_common_256.S"
+#include "gcm_precomp.S" \ No newline at end of file
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_128.S
new file mode 100644
index 000000000..42c48d9a0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_128.S
@@ -0,0 +1,32 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "gcm_common_128.S"
+#include "gcm_update.S"
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_256.S
new file mode 100644
index 000000000..1c2c33b48
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/aes_gcm_update_256.S
@@ -0,0 +1,32 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "gcm_common_256.S"
+#include "gcm_update.S"
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_aarch64_dispatcher.c
new file mode 100644
index 000000000..1a2077356
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_aarch64_dispatcher.c
@@ -0,0 +1,108 @@
+/**********************************************************************
+ Copyright(c) 2020-2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+#undef PROVIDER_BASIC
+#define PROVIDER_BASIC(a) (void*)0
+
+static unsigned long is_crypto_available(void)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ return (auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES);
+}
+
+#define DEFINE_CBC_INTERFACE_DISPATCHER(func,mode,suffix) \
+ DEFINE_INTERFACE_DISPATCHER(aes_cbc_##func##_##mode) \
+ { \
+ if (is_crypto_available()) \
+ return PROVIDER_INFO(aes_cbc_##func##_##mode##_##suffix); \
+ return PROVIDER_BASIC(aes_cbc_##func##_##mode); \
+ }
+
+DEFINE_CBC_INTERFACE_DISPATCHER(enc, 128, aes);
+DEFINE_CBC_INTERFACE_DISPATCHER(enc, 192, aes);
+DEFINE_CBC_INTERFACE_DISPATCHER(enc, 256, aes);
+
+/*
+ * AES-CBC decryption can be parallelised: each ciphertext block is decrypted
+ * independently, and the result is then XORed (EOR) with the previous
+ * ciphertext block (or with the IV for the first block) to produce the
+ * plaintext, so several blocks can be kept in flight at once. A short C
+ * sketch of this structure follows the dispatchers below.
+ *
+ * The unroll factor depends on the micro-architecture. The factors for N1,
+ * A57 and A72 are based on the optimization guides and test results; other
+ * platforms use the factor derived from ThunderX2 test results.
+ */
+DEFINE_INTERFACE_DISPATCHER(aes_cbc_dec_128)
+{
+ if (is_crypto_available()) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1):
+ return PROVIDER_INFO(aes_cbc_dec_128_aes_1);
+ case MICRO_ARCH_ID(ARM, CORTEX_A57):
+ return PROVIDER_INFO(aes_cbc_dec_128_aes_4);
+ case MICRO_ARCH_ID(ARM, CORTEX_A72):
+ return PROVIDER_INFO(aes_cbc_dec_128_aes_6);
+ }
+ return PROVIDER_INFO(aes_cbc_dec_128_aes_5);
+ }
+ return PROVIDER_BASIC(aes_cbc_dec_128);
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_cbc_dec_192)
+{
+ if (is_crypto_available()) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1):
+ return PROVIDER_INFO(aes_cbc_dec_192_aes_1);
+ case MICRO_ARCH_ID(ARM, CORTEX_A57):
+ return PROVIDER_INFO(aes_cbc_dec_192_aes_5);
+ case MICRO_ARCH_ID(ARM, CORTEX_A72):
+ return PROVIDER_INFO(aes_cbc_dec_192_aes_4);
+ }
+ return PROVIDER_INFO(aes_cbc_dec_192_aes_5);
+ }
+ return PROVIDER_BASIC(aes_cbc_dec_192);
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_cbc_dec_256)
+{
+ if (is_crypto_available()) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1):
+ return PROVIDER_INFO(aes_cbc_dec_256_aes_1);
+ case MICRO_ARCH_ID(ARM, CORTEX_A57):
+ return PROVIDER_INFO(aes_cbc_dec_256_aes_5);
+ case MICRO_ARCH_ID(ARM, CORTEX_A72):
+ return PROVIDER_INFO(aes_cbc_dec_256_aes_6);
+ }
+ return PROVIDER_INFO(aes_cbc_dec_256_aes_5);
+ }
+ return PROVIDER_BASIC(aes_cbc_dec_256);
+}
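As referenced in the comment above the dispatchers, the parallelisable shape of CBC decryption can be illustrated with a minimal C sketch. This is not the ISA-L implementation; aes_decrypt_block() is a hypothetical single-block primitive used only to show why the decrypt step parallelises ahead of the chaining XOR:

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical single-block AES decryption primitive (placeholder, not ISA-L). */
    void aes_decrypt_block(const uint8_t *expanded_keys, const uint8_t *in, uint8_t *out);

    static void cbc_dec_sketch(const uint8_t *in, const uint8_t iv[16],
                               const uint8_t *expanded_keys, uint8_t *out,
                               uint64_t len_bytes)
    {
            uint8_t prev[16];
            memcpy(prev, iv, 16);
            for (uint64_t off = 0; off < len_bytes; off += 16) {
                    uint8_t tmp[16];
                    aes_decrypt_block(expanded_keys, in + off, tmp);  /* independent per block */
                    for (int i = 0; i < 16; i++)
                            out[off + i] = tmp[i] ^ prev[i];          /* chain: previous ciphertext or IV */
                    memcpy(prev, in + off, 16);                       /* next block chains off this ciphertext */
            }
    }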
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_common.S
new file mode 100644
index 000000000..6f793843a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_common.S
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#define FN_NAME(fn,mode,post) aes_cbc_##fn##_##mode##_##post
+#define LABEL_NAME(fn,mode,post) .L##fn##_##mode##_##post
+#define START_FUNC(fn,mode,post) .global FN_NAME(fn,mode,post); \
+ .type FN_NAME(fn,mode,post), %function; \
+ FN_NAME(fn,mode,post):
+#define END_FUNC(fn,mode,post) .size FN_NAME(fn,mode,post), .-FN_NAME(fn,mode,post)
+.macro declare_var_vector_reg name:req,reg:req
+.ifdef q\name
+ .unreq q\name
+ .unreq v\name
+ .unreq s\name
+ .unreq d\name
+.endif
+ .set q\name , \reg
+ q\name .req q\reg
+ v\name .req v\reg
+ s\name .req s\reg
+ d\name .req d\reg
+.endm
+
+.macro declare_var_generic_reg name:req,reg:req
+ \name .req x\reg
+ x\name .req x\reg
+ w\name .req w\reg
+.endm \ No newline at end of file
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S
new file mode 100644
index 000000000..11bd90a71
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_dec_aes.S
@@ -0,0 +1,482 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+#include "cbc_common.S"
+ .altmacro
+.macro _aes_decrypt_round block:req,key:req
+ aesd v\block\().16b,vKey\key\().16b
+ .if \key < 13
+ aesimc v\block\().16b,v\block\().16b
+ .endif
+ .if \key > 13
+	.error "error here"
+ .endif
+.endm
+
+.macro aes_decrypt_round block,reg,key
+ _aes_decrypt_round In\reg\()_\block,\key
+.endm
+
+.macro load_keys first_key
+ .if \first_key == 4
+ ld1 {vKey4.4s -vKey6.4s},[keys],3*16
+ .endif
+ .ifc 2 , \first_key
+ ldr qKey2,[keys],1*16
+ ld1 {vKey3.16b -vKey6.16b},[keys],4*16
+ .endif
+ .ifc 0 , \first_key
+ ld1 {vKey0.16b -vKey2.16b},[keys],3*16
+ ld1 {vKey3.16b -vKey6.16b},[keys],4*16
+ .endif
+ ld1 {vKey7.16b -vKey10.16b},[keys],4*16
+ ld1 {vKey11.16b-vKey14.16b},[keys],4*16
+.endm
+
+.macro aes_decrypt_blocks_round blocks,key_idx,key_reg,next_keyreg,first_idx
+ .if \key_idx == 12
+ ldr q\next_keyreg,[keys],(\first_idx-13)*16
+ .else
+ ldr q\next_keyreg,[keys],16
+ .endif
+ n=0
+ .rept \blocks
+ _aes_decrypt_round %n,\key_reg
+ n=n+1
+ .endr
+.endm
+
+.macro aes_decrypt_rounds blocks,key_st,key_end,first_idx
+ j=key_st
+ .rept \key_end - \key_st + 1
+ aes_decrypt_blocks_round \blocks,%j,%(j%2),%((j+1)%2),\first_idx
+ j=j+1
+ .endr
+.endm
+
+.macro aes_cbc_decrypt_rounds blocks,first_idx,reg,next_reg
+ aes_decrypt_rounds \blocks,\first_idx,12,\first_idx
+.endm
+
+.macro declare_prefix idx,reg,prefix
+ declare_var_vector_reg \prefix\()\idx,\reg
+.endm
+
+.macro mldr reg,block,addr
+ ldr qIn\reg\()_\block,[\addr],16
+.endm
+
+.macro mldrin reg,blocks,addr
+ .if \blocks == 1
+ ldr qIn\reg\()_0,[\addr],16
+ .exitm
+ .endif
+ .if \blocks == 2
+ ldp qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16
+ .exitm
+ .endif
+ .if \blocks == 3
+ ldr qIn\reg\()_0,[\addr],16
+ ldp qIn\reg\()_1,qIn\reg\()_2,[\addr],2*16
+ .exitm
+ .endif
+ .if \blocks == 4
+ ld1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 5
+ ldr qIn\reg\()_0,[\addr],16
+ ld1 {vIn\reg\()_1.16b-vIn\reg\()_4.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 6
+ ldp qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16
+ ld1 {vIn\reg\()_2.16b-vIn\reg\()_5.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 7
+ ld1 {vIn\reg\()_0.16b-vIn\reg\()_2.16b},[\addr],3*16
+ ld1 {vIn\reg\()_3.16b-vIn\reg\()_6.16b},[\addr],4*16
+ .exitm
+ .endif
+
+ .if \blocks == 8
+ ld1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
+ ld1 {vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 9
+ ld1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
+ ld1 {vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16
+ ldr qIn\reg\()_8,[\addr],16
+ .exitm
+ .endif
+.endm
+
+.macro mstrout reg,blocks,addr
+ .if \blocks == 1
+ str qIn\reg\()_0,[\addr],16
+ .exitm
+ .endif
+ .if \blocks == 2
+ stp qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16
+ .exitm
+ .endif
+ .if \blocks == 3
+ str qIn\reg\()_0,[\addr],16
+ stp qIn\reg\()_1,qIn\reg\()_2,[\addr],2*16
+ .exitm
+ .endif
+ .if \blocks == 4
+ st1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 5
+ str qIn\reg\()_0,[\addr],16
+ st1 {vIn\reg\()_1.16b-vIn\reg\()_4.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 6
+ stp qIn\reg\()_0,qIn\reg\()_1,[\addr],2*16
+ st1 {vIn\reg\()_2.16b-vIn\reg\()_5.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 7
+ st1 {vIn\reg\()_0.16b-vIn\reg\()_2.16b},[\addr],3*16
+ st1 {vIn\reg\()_3.16b-vIn\reg\()_6.16b},[\addr],4*16
+ .exitm
+ .endif
+
+ .if \blocks == 8
+ st1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
+ st1 {vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16
+ .exitm
+ .endif
+ .if \blocks == 9
+ st1 {vIn\reg\()_0.16b-vIn\reg\()_3.16b},[\addr],4*16
+ st1 {vIn\reg\()_4.16b-vIn\reg\()_7.16b},[\addr],4*16
+ str qIn\reg\()_8,[\addr],16
+ .exitm
+ .endif
+.endm
+
+.macro eorkey14 block,reg
+ eor vBlock\block\().16b,vKey14.16b,vState\reg\()_\block\().16b
+.endm
+
+.macro eorblock block,reg
+ eor vIn\reg\()_\block\().16b,vBlock\block\().16b,vIn\reg\()_\block\().16b
+.endm
+
+.macro movstate0 block,reg
+ mov vState\reg\()_0.16b,vIn\reg\()_\block\().16b
+.endm
+
+.macro cbc_decrypt_rounds blocks,reg,first_key,cur_blocks
+ .ifb \cur_blocks
+ _blocks=\blocks
+ .else
+ _blocks=\cur_blocks
+ .endif
+ key=\first_key + 1
+ .if 3*\blocks+1 >= 32-15+\first_key
+ ldr_key %key,\first_key
+ .endif
+ n=0
+ .rept _blocks - 1
+ eorkey14 %((n+1)%_blocks),\reg
+ aes_decrypt_round %n,\reg,\first_key
+ n=n+1
+ .endr
+ eorkey14 0,\reg
+ movstate0 %(_blocks-1),\reg
+ aes_decrypt_round %n,\reg,\first_key
+
+ k=0
+ .rept 15-\first_key-3
+ n=0
+ .if 3*\blocks+1 >= 32-15+\first_key
+ ldr_key %(key+k+1),\first_key
+ .endif
+
+ .rept _blocks
+ aes_decrypt_round %n,\reg,%(key+k)
+ n=n+1
+ .endr
+ k=k+1
+ .endr
+ n=0
+ .if 3*\blocks+1 >= 32-15+\first_key
+ ldr_key \first_key,\first_key
+ .endif
+ .rept _blocks
+ aes_decrypt_round %n,\reg,13
+ eorblock %n,\reg
+ n=n+1
+ .endr
+.endm
+
+.macro print_macro a,b,c,d,e
+ .print "print_macro,\a \b \c \d \e"
+.endm
+
+.macro remainder_process blocks,first_key,curblk
+.if \blocks > (1<<\curblk)
+ tbz xlen_remainder,\curblk,1f
+ mldrin 0,%(1<<\curblk),in
+ cbc_decrypt_rounds \blocks,0,\first_key,%(1<<\curblk)
+ mstrout 0,%(1<<\curblk),out
+1:
+.endif
+.endm
+
+.macro aes_cbc_decrypt_blocks first_key,blocks
+ division \blocks, len_bytes,len_remainder,tmp0,tmp1
+ mov xlen_quotient_in,xlen_quotient
+ /*
+ input regs(2*\block) + tmp regs(\blocks) + State reg(1)
+ + key regs(15-\first_key) < 32
+ */
+ .if 3*\blocks+1 < 32-15+\first_key
+ n=\first_key
+ .rept 15-\first_key
+ declare_prefix %n,%(n+17),Key
+ n=n+1
+ .endr
+ load_keys \first_key
+ .else
+ n=\first_key
+ .rept 14-\first_key
+ declare_prefix %n,%((n%2)+29),Key
+ n=n+1
+ .endr
+ declare_prefix 14,31,Key
+ /* load first key */
+ ldr_key \first_key,\first_key
+ /* load last key */
+ ldr_key 14,\first_key
+ .endif
+ m=\blocks
+ l=\blocks-1
+ declare_prefix 0,0,State0_
+ declare_prefix 0,0,State1_
+ n=0
+ .rept \blocks
+ declare_prefix %n,%(n+1),In0_
+ declare_prefix %n,%(n+m+1),In1_
+ declare_prefix %n,%(n+2*m+1),Block
+ n=n+1
+ .endr
+ n=1
+ .rept \blocks -1
+ declare_prefix %n,%(n),State0_
+ declare_prefix %n,%(n+m),State1_
+ n=n+1
+ .endr
+ ldr qState0_0,[IV]
+ cbz xlen_quotient,9f
+ mldrin 0,\blocks,in
+ sub xlen_quotient_in,xlen_quotient_in,1
+ b 5f
+
+3:
+ sub xlen_quotient,xlen_quotient,1
+ mstrout 1,\blocks,out
+ cbz xlen_quotient,9f
+5:
+ cbz xlen_quotient_in,1f
+ mldrin 1,\blocks,in
+ sub xlen_quotient_in,xlen_quotient_in,1
+1:
+ cbc_decrypt_rounds \blocks,0,\first_key
+ sub xlen_quotient,xlen_quotient,1
+ mstrout 0,\blocks,out
+ cbz xlen_quotient,9f
+
+ cbz xlen_quotient_in,1f
+ mldrin 0,\blocks,in
+ sub xlen_quotient_in,xlen_quotient_in,1
+1:
+ cbc_decrypt_rounds \blocks,1,\first_key
+ b 3b
+9:
+ remainder_process \blocks,\first_key,3
+ remainder_process \blocks,\first_key,2
+ remainder_process \blocks,\first_key,1
+ remainder_process \blocks,\first_key,0
+.endm
+
+
+.macro division blocks,quotient,remainder,tmp0,tmp1
+ .if \blocks == 1
+ mov x\remainder, 0
+ .exitm
+ .endif
+ .if \blocks == 2
+ and x\remainder, x\quotient, 1
+ lsr x\quotient, x\quotient, 1
+ .exitm
+ .endif
+ .if \blocks == 3
+ mov x\tmp0, -6148914691236517206
+ mov x\remainder, x\quotient
+ movk x\tmp0, 0xaaab, lsl 0
+ umulh x\tmp0, x\quotient, x\tmp0
+ and x\tmp1, x\tmp0, -2
+ lsr x\quotient, x\tmp0, 1
+ add x\tmp1, x\tmp1, x\quotient
+ sub x\remainder, x\remainder, x\tmp1
+ .exitm
+ .endif
+ .if \blocks == 4
+ and x\remainder, x\quotient, 3
+ lsr x\quotient, x\quotient, 2
+ .exitm
+ .endif
+ .if \blocks == 5
+ mov x\tmp0, -3689348814741910324
+ mov x\remainder, x\quotient
+ movk x\tmp0, 0xcccd, lsl 0
+ umulh x\tmp0, x\quotient, x\tmp0
+ and x\tmp1, x\tmp0, -4
+ lsr x\quotient, x\tmp0, 2
+ add x\tmp1, x\tmp1, x\quotient
+ sub x\remainder, x\remainder, x\tmp1
+ .exitm
+ .endif
+ .if \blocks == 6
+ mov x\tmp0, -6148914691236517206
+ mov x\tmp1, x\quotient
+ movk x\tmp0, 0xaaab, lsl 0
+ umulh x\tmp0, x\quotient, x\tmp0
+ lsr x\quotient, x\tmp0, 2
+ add x\remainder, x\quotient, x\quotient, lsl 1
+ sub x\remainder, x\tmp1, x\remainder, lsl 1
+ .exitm
+ .endif
+ .if \blocks == 7
+ mov x\tmp0, 9363
+ mov x\tmp1, x\quotient
+ movk x\tmp0, 0x9249, lsl 16
+ movk x\tmp0, 0x4924, lsl 32
+ movk x\tmp0, 0x2492, lsl 48
+ umulh x\quotient, x\quotient, x\tmp0
+ sub x\tmp0, x\tmp1, x\quotient
+ add x\tmp0, x\quotient, x\tmp0, lsr 1
+ lsr x\quotient, x\tmp0, 2
+ lsl x\remainder, x\quotient, 3
+ sub x\remainder, x\remainder, x\quotient
+ sub x\remainder, x\tmp1, x\remainder
+ .exitm
+ .endif
+ .if \blocks == 8
+ and x\remainder, x\quotient, 7
+ lsr x\quotient, x\quotient, 3
+ .exitm
+ .endif
+ .if \blocks == 9
+ mov x\tmp0, 58255
+ mov x\remainder, x\quotient
+ movk x\tmp0, 0x8e38, lsl 16
+ movk x\tmp0, 0x38e3, lsl 32
+ movk x\tmp0, 0xe38e, lsl 48
+ umulh x\tmp0, x\quotient, x\tmp0
+ and x\tmp1, x\tmp0, -8
+ lsr x\quotient, x\tmp0, 3
+ add x\tmp1, x\tmp1, x\quotient
+ sub x\remainder, x\remainder, x\tmp1
+ .exitm
+ .endif
+.endm
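The division macro above computes the quotient and remainder of the block count without a divide instruction: it multiplies by a precomputed 64-bit reciprocal (the mov/movk constants), takes the high half with umulh, and shifts. A hedged C sketch of the same trick for the blocks == 3 case, relying on the GCC/Clang unsigned __int128 extension:

    #include <stdint.h>

    /* n / 3 and n % 3 via reciprocal multiplication:
     * magic = ceil(2^65 / 3) = 0xAAAAAAAAAAAAAAAB, quotient = umulh(n, magic) >> 1,
     * mirroring the mov/movk/umulh/lsr sequence the macro emits for blocks == 3. */
    static void div3_sketch(uint64_t n, uint64_t *q, uint64_t *r)
    {
            const uint64_t magic = 0xAAAAAAAAAAAAAAABULL;
            uint64_t hi = (uint64_t)(((unsigned __int128)n * magic) >> 64);  /* umulh */
            *q = hi >> 1;
            *r = n - *q * 3;
    }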
+
+.macro ldr_key num,first_key
+ ldr qKey\num,[keys,16*(\num - \first_key)]
+.endm
+#ifndef CBC_DECRYPT_BLOCKS_NUM
+#define CBC_DECRYPT_BLOCKS_NUM 8
+#endif
+
+.macro cbc_decrypt first_key:req,blocks
+ lsr xlen_bytes,xlen_bytes,4
+ cbz xlen_bytes,10f
+ push_stack
+ aes_cbc_decrypt_blocks \first_key,\blocks
+ pop_stack
+10:
+.endm
+
+.set stack_size,64
+.macro push_stack
+ stp d8, d9,[sp,-stack_size]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+.endm
+
+.macro pop_stack
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], stack_size
+.endm
+
+/*
+void aes_cbc_dec_128(
+ void *in, //!< Input cipher text
+	uint8_t *IV,	//!< Must be 16 bytes, aligned to a 16 byte boundary
+	uint8_t *keys,	//!< Expanded decryption keys; must be on a 16 byte boundary, with length of key size * key rounds (or dec_keys of cbc_key_data)
+ void *out, //!< Output plain text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+*/
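A minimal usage sketch of the entry point documented above, grounded only in that signature; how keys is populated (the expanded decryption key schedule, e.g. dec_keys of cbc_key_data) is outside this file and left as an assumption:

    #include <stdint.h>

    /* Prototype as documented above. */
    void aes_cbc_dec_128(void *in, uint8_t *IV, uint8_t *keys, void *out, uint64_t len_bytes);

    static void cbc_dec_call_sketch(uint8_t *cipher, uint64_t len_bytes, /* multiple of 16 */
                                    uint8_t iv[16], uint8_t *dec_keys,   /* 16-byte aligned expanded keys */
                                    uint8_t *plain)
    {
            aes_cbc_dec_128(cipher, iv, dec_keys, plain, len_bytes);
    }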
+ declare_var_generic_reg in ,0
+ declare_var_generic_reg IV ,1
+ declare_var_generic_reg keys ,2
+ declare_var_generic_reg out ,3
+ declare_var_generic_reg len_bytes ,4
+ declare_var_generic_reg len_quotient,4
+ declare_var_generic_reg len_remainder,5
+ declare_var_generic_reg tmp0 ,6
+ declare_var_generic_reg tmp1 ,7
+ declare_var_generic_reg len_quotient_in,6
+
+.macro define_aes_cbc_dec_func mode:req,blocks:req
+ .global aes_cbc_dec_\mode\()_aes_\blocks
+aes_cbc_dec_\mode\()_aes_\blocks:
+ cbc_decrypt %((256-mode)/32),\blocks
+ ret
+ .size aes_cbc_dec_\mode\()_aes_\blocks, . - aes_cbc_dec_\mode\()_aes_\blocks
+.endm
+
+.irp blocks,1,2,3,4,5,6,7,8,9
+ define_aes_cbc_dec_func 128,\blocks
+ define_aes_cbc_dec_func 192,\blocks
+ define_aes_cbc_dec_func 256,\blocks
+.endr
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S
new file mode 100644
index 000000000..8eb5e507d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_enc_aes.S
@@ -0,0 +1,157 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "cbc_common.S"
+
+ declare_var_vector_reg Key0 ,17
+ declare_var_vector_reg Key1 ,18
+ declare_var_vector_reg Key2 ,19
+ declare_var_vector_reg Key3 ,20
+ declare_var_vector_reg Key4 ,21
+ declare_var_vector_reg Key5 ,22
+ declare_var_vector_reg Key6 ,23
+ declare_var_vector_reg Key7 ,24
+ declare_var_vector_reg Key8 ,25
+ declare_var_vector_reg Key9 ,26
+ declare_var_vector_reg Key10 ,27
+ declare_var_vector_reg Key11 ,28
+ declare_var_vector_reg Key12 ,29
+ declare_var_vector_reg Key13 ,30
+ declare_var_vector_reg Key14 ,31
+
+.macro aes_encrypt_round block,key
+ aese v\block\().16b,vKey\key\().16b
+ .if \key < 13
+ aesmc v\block\().16b,v\block\().16b
+ .endif
+.endm
+
+.macro aes_encrypt_round_name block,key
+ aese v\block\().16b,v\key\().16b
+ aesmc v\block\().16b,v\block\().16b
+.endm
+
+
+
+.set stack_size,64
+.macro push_stack
+ stp d8, d9,[sp,-stack_size]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+.endm
+
+.macro pop_stack
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], stack_size
+.endm
+/*
+void aes_cbc_enc_128(
+	void    *in,	//!< Input plain text
+	uint8_t *IV,	//!< Must be 16 bytes, aligned to a 16 byte boundary
+	uint8_t *keys,	//!< Expanded encryption keys; must be on a 16 byte boundary, with length of key size * key rounds (or enc_keys of cbc_key_data)
+	void    *out,	//!< Output cipher text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+*/
+ declare_var_generic_reg in ,0
+ declare_var_generic_reg IV ,1
+ declare_var_generic_reg keys ,2
+ declare_var_generic_reg out ,3
+ declare_var_generic_reg len_bytes ,4
+
+ declare_var_vector_reg State ,0
+ declare_var_vector_reg FirstKey ,1
+ declare_var_vector_reg Block ,2
+ declare_var_vector_reg ConstKey ,3
+.macro load_key num
+ ldr qKey\num,[keys],16
+.endm
+.altmacro
+.macro cbc_encrypt first:req
+ lsr xlen_bytes,xlen_bytes,4
+ cbz xlen_bytes,3f
+ ldr qState,[IV]
+ ldr qKey\first,[keys],16
+ .set lastkey_off,13-\first
+ ldr qKey14,[keys,lastkey_off*16]
+ ldr qBlock,[in],16
+ n=\first
+ second=1+\first
+ .rept 5-n
+ n=n+1
+ load_key %n
+ .endr
+ ld1 {vKey6.4s - vKey9.4s},[keys],4*16
+ eor vBlock.16b,vBlock.16b ,vState.16b
+ eor vConstKey.16b,vKey\first\().16b,vKey14.16b
+ aes_encrypt_round Block,\first
+ ld1 {vKey10.4s - vKey13.4s},[keys]
+ b 1f
+2:
+ aes_encrypt_round Block,\first
+ str qState,[out],16
+1:
+ sub xlen_bytes,xlen_bytes,1
+ aes_encrypt_round Block,%second
+ cbz xlen_bytes,1f
+ ldr qKey\first,[in],16
+1:
+ n=second
+ .rept 12-n
+ n=n+1
+ aes_encrypt_round Block,%n
+ .endr
+
+ eor vKey\first\().16b,vKey\first\().16b,vConstKey.16b
+ aes_encrypt_round Block,13
+ eor vState.16b,vBlock.16b,vKey14.16b
+ cbnz xlen_bytes,2b
+ str qState,[out]
+3:
+
+.endm
+START_FUNC(enc,128,aes)
+ cbc_encrypt 4
+ ret
+END_FUNC(enc,128,aes)
+
+START_FUNC(enc,192,aes)
+ cbc_encrypt 2
+ ret
+END_FUNC(enc,192,aes)
+
+START_FUNC(enc,256,aes)
+ cbc_encrypt 0
+ ret
+END_FUNC(enc,256,aes) \ No newline at end of file
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_multibinary_aarch64.S
new file mode 100644
index 000000000..fba533754
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/cbc_multibinary_aarch64.S
@@ -0,0 +1,38 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aarch64_multibinary.h"
+
+mbin_interface aes_cbc_dec_128
+mbin_interface aes_cbc_dec_192
+mbin_interface aes_cbc_dec_256
+
+mbin_interface aes_cbc_enc_128
+mbin_interface aes_cbc_enc_192
+mbin_interface aes_cbc_enc_256
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_aarch64_dispatcher.c
new file mode 100644
index 000000000..f8188e3ae
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_aarch64_dispatcher.c
@@ -0,0 +1,255 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+#undef PROVIDER_BASIC
+#define PROVIDER_BASIC(a) (void*)0
+
+static unsigned long is_crypto_available(void)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ return (auxval & (HWCAP_ASIMD | HWCAP_AES | HWCAP_PMULL)) ==
+ (HWCAP_ASIMD | HWCAP_AES | HWCAP_PMULL);
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_128_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_128);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_128_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_128);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_precomp_128)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_precomp_128_aes);
+
+ return PROVIDER_BASIC(aes_gcm_precomp_128);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_256_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_256);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_256_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_256);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_precomp_256)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_precomp_256_aes);
+
+ return PROVIDER_BASIC(aes_gcm_precomp_256);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_update)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_128_update_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_128_update);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_finalize)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_128_finalize_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_128_finalize);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_update)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_128_update_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_128_update);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_finalize)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_128_finalize_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_128_finalize);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_update)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_256_update_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_256_update);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_finalize)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_256_finalize_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_256_finalize);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_update)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_256_update_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_256_update);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_finalize)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_256_finalize_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_256_finalize);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_init_256)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_init_256_aes);
+
+ return PROVIDER_BASIC(aes_gcm_init_256);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_init_128)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_init_128_aes);
+
+ return PROVIDER_BASIC(aes_gcm_init_128);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_128_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_128_nt);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_128_update_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_128_update_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_128_update_nt);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_128_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_128_nt);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_128_update_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_128_update_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_128_update_nt);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_256_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_256_nt);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_enc_256_update_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_enc_256_update_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_enc_256_update_nt);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_256_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_256_nt);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_gcm_dec_256_update_nt)
+{
+ if (is_crypto_available())
+ return PROVIDER_INFO(aes_gcm_dec_256_update_nt_aes);
+
+ return PROVIDER_BASIC(aes_gcm_dec_256_update_nt);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common.S
new file mode 100644
index 000000000..042f6cf19
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common.S
@@ -0,0 +1,430 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+#define HASHKEY_TOTAL_NUM (24)
+#define HASHKEY_BASE_OFF (15*16)
+#define HASHKEY_OFF(n) ((15*16)+n*32)
+#define HASHKEY_EXT_OFF(n) ((15*16)+n*32+16)
+#ifndef KEY_LEN
+#define KEY_LEN 128
+#endif
+#ifndef BLOCKS
+#define BLOCKS 24
+#endif
+#define FN_NAME(fn,mode,post) aes_gcm_##fn##_##mode####post##aes
+#define START_FUNC(fn,mode,post) .global FN_NAME(fn,mode,post); \
+ .type FN_NAME(fn,mode,post), %function; \
+ FN_NAME(fn,mode,post):
+#define END_FUNC(fn,mode,post) .size FN_NAME(fn,mode,post), .-FN_NAME(fn,mode,post)
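For example, START_FUNC(enc,128,_) expands (via FN_NAME) to the global symbol aes_gcm_enc_128_aes, which is exactly the name the C dispatcher earlier in this patch returns through PROVIDER_INFO(aes_gcm_enc_128_aes); the _nt_ variants produce the corresponding aes_gcm_*_nt_aes symbols.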
+
+#define AAD_LEN_OFF 16
+#define IN_LENGTH_OFF 24
+#define PARTIAL_BLOCK_ENC_KEY_OFF 32
+#define PARTIAL_BLOCK_LENGTH_OFF 80
+#define CTR_OFF 64
+#define ORIG_IV_OFF 48
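In other words, each hash-key slot is 32 bytes: HASHKEY_OFF(n) = 240 + 32*n holds a bit-reflected power of the hash key and HASHKEY_EXT_OFF(n) = HASHKEY_OFF(n) + 16 holds its 8-byte-rotated copy, so the 24 slots span offsets 240 through 1007 (HASHKEY_BASE_OFF = 15*16 matches the size of a maximal expanded key schedule). gcm_precomp.S, later in this patch, fills the table from the highest slot downward, so the last slot holds H and slot 0 holds H^24.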
+/*
+	[high,middle,low] += aadhash * [hashkey0,hashkey0_ext]
+	aadhash = rbit(*dat_adr); dat_adr += 16
+	if left_count > 1:
+		[hashkey0,hashkey0_ext] = *hashkey_adr; hashkey_adr += 32
+*/
+
+.macro ghash_mult_round aadhash:req,dat_adr:req,hashkey_adr:req, \
+ hashkey0:req,hashkey0_ext:req,high:req,low:req,middle:req, \
+ tmp0:req,tmp1:req,next_dat:req,left_count:req
+
+ ldr q\next_dat,[\dat_adr],16
+ pmull v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
+ pmull2 v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
+ .if \left_count > 1
+ ldr q\hashkey0_ext,[\hashkey_adr,16]
+ .endif
+ eor v\middle\().16b,v\middle\().16b,v\tmp0\().16b
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0\().2d
+ eor v\middle\().16b,v\middle\().16b,v\tmp1\().16b
+ pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0\().1d
+ .if \left_count > 1
+ ldr q\hashkey0,[\hashkey_adr],32
+ .endif
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ rbit v\aadhash\().16b, v\next_dat\().16b
+.endm
+
+.macro ghash_mult_init_round aadhash:req,dat_adr:req,hashkey_adr:req, \
+ hashkey0:req,hashkey0_ext:req, \
+ high:req,low:req,middle:req,tmp0:req,next_dat:req,left_count:req
+ ldp q\hashkey0,q\hashkey0_ext,[\hashkey_adr],32
+ ldr q\next_dat,[\dat_adr],16
+ pmull v\middle\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
+ .if \left_count > 1
+ ldr q\hashkey0_ext,[\hashkey_adr,16]
+ .endif
+ pmull2 v\high\().1q,v\aadhash\().2d,v\hashkey0\().2d
+ eor v\middle\().16b,v\middle\().16b,v\tmp0\().16b
+
+ pmull v\low\().1q,v\aadhash\().1d,v\hashkey0\().1d
+ .if \left_count > 1
+ ldr q\hashkey0,[\hashkey_adr],32
+ .endif
+ rbit v\aadhash\().16b, v\next_dat\().16b
+.endm
+
+/* aadhash = reduction(high,middle,low) + aadhash */
+.macro ghash_mult_final_round aadhash:req, \
+ high:req,low:req,middle:req,tmp0:req, \
+ zero:req,poly:req
+
+ ext v\tmp0\().16b,v\middle\().16b,v\zero\().16b,8 /*high*/
+ ext v\middle\().16b,v\zero\().16b,v\middle\().16b,8 /*low */
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ eor v\low\().16b,v\low\().16b,v\middle\().16b
+
+ pmull2 v\middle\().1q,v\high\().2d,v\poly\().2d
+
+ ext v\tmp0\().16b,v\middle\().16b,v\zero\().16b,8 /*high*/
+ ext v\middle\().16b,v\zero\().16b,v\middle\().16b,8 /*low*/
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ eor v\low\().16b,v\low\().16b,v\middle\().16b
+ pmull v\middle\().1q,v\high\().1d,v\poly\().1d
+ eor v\tmp0\().16b, v\low\().16b, v\middle\().16b
+ eor v\aadhash\().16b, v\aadhash\().16b, v\tmp0\().16b
+.endm
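For reference, the two pmull/ext folds above implement the usual reflected GHASH reduction; a sketch of the identity they encode, with the callers' 0x87 constant standing for the low terms x^7 + x^2 + x + 1 of the GHASH polynomial:

    $$ g(x) = x^{128} + x^7 + x^2 + x + 1, \qquad
       H(x)\,x^{128} + L(x) \;\equiv\; L(x) \oplus H(x)\,(x^7 + x^2 + x + 1) \pmod{g(x)} $$

The fold is applied twice, 64 bits at a time, because pmull/pmull2 each consume only one 64-bit half of the high part.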
+.macro ghash_reset_hashkey_addr hashkey_addr:req,hashkey_base:req,count:req
+	add	\hashkey_addr,\hashkey_base,(HASHKEY_TOTAL_NUM-\count)<<5
+.endm
+
+
+.macro ghash_block_n count:req,aadhash:req, dat:req,dat_addr:req, hashkey_addr:req, hashkey_base:req, \
+ hashkey:req,hashkey_ext:req,high:req,low:req,middle:req, zero:req,poly:req, \
+ tmp0:req,tmp1:req
+
+ ghash_reset_hashkey_addr \hashkey_addr,\hashkey_base,\count
+ ghash_mult_init_round \aadhash,\dat_addr,\hashkey_addr,\hashkey,\hashkey_ext, \
+ \high,\low,\middle,\tmp0,\dat,\count
+ .set left_count,\count - 1
+ .rept left_count
+ ghash_mult_round \aadhash,\dat_addr,\hashkey_addr,\hashkey,\hashkey_ext, \
+ \high,\low,\middle,\tmp0,\tmp1,\dat, left_count
+ .set left_count,left_count - 1
+
+ .endr
+ ghash_mult_final_round \aadhash,\high,\low,\middle,\tmp0,\zero,\poly
+.endm
+
+/*
+ aadhash=aadhash*[hashkey,hashkey_ext] + rbit(dat)
+*/
+.macro ghash_block_reg aadhash:req, dat:req, \
+ hashkey:req,hashkey_ext:req,high:req,low:req,middle:req, zero:req,poly:req, \
+ tmp0:req
+ pmull v\middle\().1q,v\aadhash\().1d,v\hashkey_ext\().1d
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey_ext\().2d
+ pmull2 v\high\().1q,v\aadhash\().2d,v\hashkey\().2d
+ eor v\middle\().16b,v\middle\().16b,v\tmp0\().16b
+ pmull v\low\().1q,v\aadhash\().1d,v\hashkey\().1d
+ rbit v\aadhash\().16b, v\dat\().16b
+ ghash_mult_final_round \aadhash,\high,\low,\middle,\tmp0,\zero,\poly
+.endm
+
+.macro ghash_mult_round_noload aadhash:req, \
+ hashkey0:req,hashkey0_ext:req,high:req,low:req,middle:req, \
+ tmp0:req,tmp1:req
+
+ pmull v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
+ pmull2 v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
+ eor v\middle\().16b,v\middle\().16b,v\tmp0\().16b
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0\().2d
+ eor v\middle\().16b,v\middle\().16b,v\tmp1\().16b
+ pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0\().1d
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+
+.endm
+
+/* aadhash = reduction(high, low + aadhash, poly) */
+.macro poly_mult_final_x2 aadhash:req, \
+ high:req,low:req,tmp0:req,tmp1:req, \
+ poly:req
+ pmull2 v\tmp1\().1q,v\high\().2d,v\poly\().2d
+ eor v\low\().16b, v\aadhash\().16b, v\low\().16b
+ eor v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b
+ ext v\tmp0\().16b,v\tmp1\().16b,v\aadhash\().16b,8 //high
+ ext v\tmp1\().16b,v\aadhash\().16b,v\tmp1\().16b,8 //low
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ pmull v\tmp1\().1q,v\high\().1d,v\poly\().1d
+ eor v\aadhash\().16b, v\low\().16b, v\tmp1\().16b
+.endm
+
+.macro aes_encrypt_round block,key
+ aese v\block\().16b,v\key\().16b
+ aesmc v\block\().16b,v\block\().16b
+.endm
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name .req q\reg
+ v\name .req v\reg
+ s\name .req s\reg
+ d\name .req d\reg
+.endm
+
+.macro declare_var_generic_reg name:req,reg:req
+ \name .req x\reg
+ x\name .req x\reg
+ w\name .req w\reg
+.endm
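As a concrete example, declare_var_vector_reg Key0,16 makes qKey0, vKey0, sKey0 and dKey0 all .req aliases of SIMD register 16, so ldr qKey0 and aese with vKey0.16b refer to the same physical register; declare_var_generic_reg does the same for the x/w views of a general-purpose register.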
+
+/* Read fewer than 16 bytes of data */
+.macro read_small_data dest:req,src:req,size:req,tbl_adr:req,tbl:req
+ ldr q\tbl,[\tbl_adr,\size,lsl 4]
+ tbz \size,3,1f
+ ld1 {v\dest\().d}[0],[\src],8
+1:
+ tbz \size,2,1f
+ ld1 {v\dest\().s}[2],[\src],4
+1:
+ tbz \size,1,1f
+ ld1 {v\dest\().h}[6],[\src],2
+1:
+ tbz \size,0,1f
+ ld1 {v\dest\().b}[14],[\src],1
+1:
+ tbl v\dest\().16b,{v\dest\().16b},v\tbl\().16b
+.endm
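For instance, with size = 11 (binary 1011) the macro performs an 8-byte, a 2-byte and a 1-byte load into fixed lanes and then shuffles those 11 bytes into their final positions using the size-indexed tbl mask fetched from the table at tbl_adr.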
+.macro read_small_data_start dest:req,src:req,size:req,tbl_adr:req,tbl:req
+ adrp \tbl_adr,:got:read_small_data_table
+ ldr \tbl_adr,[\tbl_adr,#:got_lo12:read_small_data_table]
+ read_small_data \dest,\src,\size,\tbl_adr,\tbl
+.endm
+
+.macro read_small_data_end dest:req,src:req,size:req,tbl_adr:req,tbl:req
+ adrp \tbl_adr,:got:read_end_small_data_table
+ ldr \tbl_adr,[\tbl_adr,#:got_lo12:read_end_small_data_table]
+ read_small_data \dest,\src,\size,\tbl_adr,\tbl
+.endm
+
+.macro write_small_data src:req,dest:req,size:req,tbl_adr:req,tmp1:req
+ ldr q\tmp1,[\tbl_adr,\size,lsl 4]
+ tbl v\tmp1\().16b,{v\src\().16b},v\tmp1\().16b
+ tbz \size,3,1f
+ st1 {v\tmp1\().d}[0],[\dest],8
+1:
+ tbz \size,2,1f
+ st1 {v\tmp1\().s}[2],[\dest],4
+1:
+ tbz \size,1,1f
+ st1 {v\tmp1\().h}[6],[\dest],2
+1:
+ tbz \size,0,1f
+ st1 {v\tmp1\().b}[14],[\dest],1
+1:
+.endm
+.macro write_small_data_start src:req,dest:req,size:req,tbl_adr:req,tmp1:req
+ adrp \tbl_adr,:got:write_small_data_table
+ ldr \tbl_adr,[\tbl_adr,#:got_lo12:write_small_data_table]
+ write_small_data \src,\dest,\size,\tbl_adr,\tmp1
+.endm
+.macro write_small_data_end src:req,dest:req,size:req,tbl_adr:req,tmp1:req
+ adrp \tbl_adr,:got:write_end_small_data_table
+ ldr \tbl_adr,[\tbl_adr,#:got_lo12:write_end_small_data_table]
+ write_small_data \src,\dest,\size,\tbl_adr,\tmp1
+.endm
+
+.macro tbx_small_data_end src:req,dest:req,size:req,tbl_adr:req,tmp1:req
+ adrp \tbl_adr,:got:tbx_end_small_data_table
+ ldr \tbl_adr,[\tbl_adr,#:got_lo12:tbx_end_small_data_table]
+ ldr q\tmp1,[\tbl_adr,\size,lsl 4]
+ tbx v\dest\().16b,{v\src\().16b},v\tmp1\().16b
+.endm
+
+.macro tbx_small_data_start src:req,dest:req,size:req,tbl_adr:req,tmp1:req
+ adrp \tbl_adr,:got:tbx_start_small_data_table
+ ldr \tbl_adr,[\tbl_adr,#:got_lo12:tbx_start_small_data_table]
+ ldr q\tmp1,[\tbl_adr,\size,lsl 4]
+ tbx v\dest\().16b,{v\src\().16b},v\tmp1\().16b
+.endm
+
+
+.macro clear_small_data dest:req,zero:req,size:req,tbl_adr:req,tmp1:req
+ adrp \tbl_adr,:got:shift_small_data_table
+ ldr \tbl_adr,[\tbl_adr,#:got_lo12:shift_small_data_table]
+ add \tbl_adr,\tbl_adr,16
+ sub \tbl_adr,\tbl_adr,\size
+ ldr q\tmp1,[\tbl_adr]
+ tbx v\dest\().16b,{v\zero\().16b},v\tmp1\().16b
+.endm
+
+
+.macro aes_gcm_n_round is_enc:req,count:req,aadhash:req, dat_addr:req, \
+ hashkey_addr:req, hashkey_base:req, \
+ hashkey:req,hashkey_ext:req,high:req,low:req, poly:req, \
+ ctr:req,enc_ctr:req,one:req,out_adr:req, \
+ tmp0:req,tmp1:req
+
+ ghash_reset_hashkey_addr \hashkey_addr,\hashkey_base,\count
+
+ aes_gcm_init \is_enc,\aadhash,\dat_addr,\hashkey_addr, \
+ \hashkey,\hashkey_ext, \high,\low, \
+ \ctr,\enc_ctr,\one,\out_adr, \
+ \tmp0,\tmp1,\count
+
+ .set left_count,\count - 1
+ .rept left_count
+ aes_gcm_middle \is_enc,\aadhash,\dat_addr,\hashkey_addr, \
+ \hashkey,\hashkey_ext, \high,\low, \
+ \ctr,\enc_ctr,\one,\out_adr, \
+ \tmp0,\tmp1, left_count
+ .set left_count,left_count - 1
+ .endr
+
+ poly_mult_final_x2 \aadhash,\high,\low,\tmp0,\tmp1,\poly
+
+.endm
+
+
+/*
+	aadhash=aadhash*[hashkey_base[(HASHKEY_TOTAL_NUM-2),(HASHKEY_TOTAL_NUM-1)]] + rbit(dat)
+*/
+.macro ghash_block_reg_x2 aadhash:req, dat:req, hashkey_base:req, \
+ hashkey:req,high:req,low:req,tmp0:req, tmp1:req, \
+ tmp2:req,temp0:req
+	ldr	q\hashkey,[\hashkey_base,(HASHKEY_TOTAL_NUM-1)*32+16]
+	eor	v\tmp2\().16b,v\tmp2\().16b,v\tmp2\().16b	//zero
+ pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey\().1d
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey\().2d
+	ldr	q\hashkey,[\hashkey_base,(HASHKEY_TOTAL_NUM-1)*32]
+ eor v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b
+ ext v\tmp0\().16b,v\tmp0\().16b,v\tmp2\().16b,8 /*high*/
+ ext v\tmp1\().16b,v\tmp2\().16b,v\tmp0\().16b,8 /*low*/
+ pmull2 v\high\().1q,v\aadhash\().2d,v\hashkey\().2d
+	mov	\temp0,0x87
+	pmull	v\low\().1q,v\aadhash\().1d,v\hashkey\().1d
+	dup	v\tmp2\().2d,\temp0
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ rbit v\aadhash\().16b, v\dat\().16b
+ poly_mult_final_x2 \aadhash,\high,\low,\tmp0,\tmp1,\tmp2
+.endm
+
+.macro __generic_load_small_data is_enc:req,len_bit:req,small_read_len:req, \
+ in_adr:req,out_adr:req,partial_block:req,temp0:req,temp1:req,r:req,p
+ tbz \small_read_len,\len_bit,1f
+ ldr\p \r\()\temp0,[\in_adr],1<<\len_bit /*in */
+ ldr\p \r\()\temp1,[\partial_block] /* partial*/
+ eor \r\()\temp1,\r\()\temp0,\r\()\temp1
+ .ifc \is_enc ,decrypt
+ str\p \r\()\temp0,[\partial_block],1<<\len_bit
+ .endif
+ .ifc \is_enc, encrypt
+ str\p \r\()\temp1,[\partial_block],1<<\len_bit
+ .endif
+ str\p \r\()\temp1,[\out_adr],1<<\len_bit
+1:
+.endm
+.macro generic_load_partial_block is_enc:req,small_read_len:req,in_adr:req,out_adr:req, \
+ partial_block:req,temp0:req,temp1:req
+ __generic_load_small_data \is_enc,3,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,x /* small_read_len >=8 */
+ __generic_load_small_data \is_enc,2,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,w /* small_read_len >=4 */
+ __generic_load_small_data \is_enc,1,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,w,h /* small_read_len >=2 */
+ __generic_load_small_data \is_enc,0,\small_read_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1,w,b /* small_read_len >=1 */
+.endm
+/* general-purpose-register version (no NEON loads) */
+.macro generic_partial_block_start is_enc:req,in_len:req,in_adr:req,out_adr:req,context:req, \
+ partial_block:req,partial_block_len:req,small_read_len:req,left_partial_block_len:req, \
+ temp0:req
+ mov \left_partial_block_len,16
+ add \partial_block,\context,PARTIAL_BLOCK_ENC_KEY_OFF
+ sub \left_partial_block_len,\left_partial_block_len,\partial_block_len
+ add \partial_block,\partial_block,\partial_block_len
+ cmp \in_len,\left_partial_block_len
+ csel \small_read_len,\in_len,\left_partial_block_len, ls
+ add \partial_block_len,\partial_block_len,\small_read_len
+ sub \in_len,\in_len,\small_read_len
+ and \partial_block_len,\partial_block_len,0xf
+ str \partial_block_len,[\context,PARTIAL_BLOCK_LENGTH_OFF]
+ generic_load_partial_block \is_enc,\small_read_len,\in_adr,\out_adr,\partial_block, \
+ \left_partial_block_len,\temp0 /* small_read_len >=8 */
+.endm
+.macro generic_paritial_block_end is_enc:req,in_len:req,in_adr:req,out_adr:req,context:req, \
+ partial_block:req,temp0:req,temp1:req
+ str \in_len,[\context,PARTIAL_BLOCK_LENGTH_OFF]
+ add \partial_block,\context,PARTIAL_BLOCK_ENC_KEY_OFF
+ generic_load_partial_block \is_enc,\in_len,\in_adr,\out_adr,\partial_block,\temp0,\temp1 /* small_read_len >=8 */
+.endm
+/*partial_block_len+in_len < 16,partial_block_len=0,in_len>0 */
+.macro paritial_block_small_length is_enc:req,context:req,in_len:req,in_adr:req,out_adr:req,temp0:req,temp1:req,Ctr:req
+
+	cbz	\in_len,1f
+ ldr \temp0,[\context,PARTIAL_BLOCK_LENGTH_OFF]
+ add \temp1,\temp0,\in_len
+ str \temp1,[\context,PARTIAL_BLOCK_LENGTH_OFF]
+ add \context,\temp0,PARTIAL_BLOCK_ENC_KEY_OFF
+2:/* loop start */
+ sub \in_len,\in_len,1
+ ldrb w\temp0,[\in_adr],1
+ ldrb w\temp1,[\context]
+ eor w\temp1,w\temp1,w\temp0
+ strb w\temp1,[\out_adr],1
+.ifc \is_enc , encrypt
+ strb w\temp1,[\context],1
+.endif
+.ifc \is_enc,decrypt
+ strb w\temp0,[\context],1
+.endif
+ cbnz \in_len,2b
+1:/* loop end */
+.endm
+
+/* 0<in_len < 16,partial_block_len=0 */
+.macro paritial_block_end is_enc:req,context:req,in_len:req,in_adr:req,out_adr:req, \
+ temp0:req,partial_block_len:req \
+ PartialBlock:req,ctr:req,one:req,Tmp2:req,Tmp3:req,Tmp4:req
+ add v\ctr\().4s,v\ctr\().4s,v\one\().4s //increase ctr
+ str q\ctr,[context,CTR_OFF]
+ read_small_data_start \PartialBlock,\in_adr,\in_len,\tbl_adr,\Tmp0
+ aes_encrypt_block \ctr
+
+.endm
+ declare_var_vector_reg Key0 ,16
+ declare_var_vector_reg Key1 ,17
+ declare_var_vector_reg Key2 ,18
+ declare_var_vector_reg Key3 ,19
+ declare_var_vector_reg Key4 ,20
+ declare_var_vector_reg Key5 ,21
+ declare_var_vector_reg Key6 ,22
+ declare_var_vector_reg Key7 ,23
+ declare_var_vector_reg Key8 ,24
+ declare_var_vector_reg Key9 ,25
+ declare_var_vector_reg Key10,26
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_128.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_128.S
new file mode 100644
index 000000000..02add91a2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_128.S
@@ -0,0 +1,165 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define KEY_LEN 128
+#include "gcm_common.S"
+
+#define KEY_REGS 0,1,2,3,4,5,6,7,8
+.macro aes_encrypt_block block:req
+ aes_encrypt_round \block,Key0
+ aes_encrypt_round \block,Key1
+ aes_encrypt_round \block,Key2
+ aes_encrypt_round \block,Key3
+ aes_encrypt_round \block,Key4
+ aes_encrypt_round \block,Key5
+ aes_encrypt_round \block,Key6
+ aes_encrypt_round \block,Key7
+ aes_encrypt_round \block,Key8
+ aese v\block\().16b,vKey9.16b
+ eor v\block\().16b,v\block\().16b,vKey10.16b
+.endm
+
+/*
+	Load the AES-128 round keys into vKey0..vKey10
+ */
+.macro load_aes_keys key_addr:req
+ ld1 {vKey0.4s- vKey3.4s},[\key_addr],64
+ ld1 {vKey4.4s- vKey7.4s},[\key_addr],64
+ ldp qKey8,qKey9,[\key_addr],32
+ ldr qKey10,[\key_addr],15*16 - 128 - 32
+.endm
+
+
+
+/*
+	[high,low] += aadhash * [hashkey0,hashkey0_ext]
+	dat = *dat_adr; dat_adr += 16
+	ctr += 1; enc_dat = aes_encrypt(ctr) ^ dat
+	*out_adr = enc_dat; out_adr += 16
+	aadhash = rbit(enc_dat) when encrypting, rbit(dat) when decrypting
+	if left_count > 1: [hashkey0,hashkey0_ext] = *hashkey_adr; hashkey_adr += 32
+*/
+.macro aes_gcm_middle is_enc:req,aadhash:req,dat_adr:req,hashkey_adr:req, \
+ hashkey0:req,hashkey0_ext:req,high:req,low:req, \
+ ctr:req,enc_ctr:req,one:req,out_adr:req, \
+ tmp0:req,tmp1:req,left_count:req
+
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0\().2d
+ pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0\().1d
+ .if \left_count > 1
+ ldr q\hashkey0,[\hashkey_adr],16
+ .endif
+
+ add v\ctr\().4s,v\ctr\().4s,v\one\().4s //increase ctr
+
+ rev32 v\enc_ctr\().16b,v\ctr\().16b
+ aes_encrypt_round \enc_ctr,Key0
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ pmull v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ pmull2 v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
+ .if \left_count > 1
+ ldr q\hashkey0_ext,[\hashkey_adr],16
+ .endif
+ eor v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b
+ aes_encrypt_round \enc_ctr,Key1
+ aes_encrypt_round \enc_ctr,Key2
+ eor v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b
+ aes_encrypt_round \enc_ctr,Key3
+ ext v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8
+ ext v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8
+ aes_encrypt_round \enc_ctr,Key4
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ aes_encrypt_round \enc_ctr,Key5
+ ldr q\aadhash,[\dat_adr],16
+ aes_encrypt_round \enc_ctr,Key6
+ aes_encrypt_round \enc_ctr,Key7
+ aes_encrypt_round \enc_ctr,Key8
+ aese v\enc_ctr\().16b,vKey9.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,vKey10.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b
+ .ifc \is_enc, encrypt
+ rbit v\aadhash\().16b,v\enc_ctr\().16b
+ .endif
+ .ifc \is_enc , decrypt
+ rbit v\aadhash\().16b,v\aadhash\().16b
+ .endif
+ str q\enc_ctr,[\out_adr],16
+.endm
+
+.macro aes_gcm_init is_enc:req,aadhash:req,dat_adr:req,hashkey_adr:req, \
+ hashkey0:req,hashkey0_ext:req, high:req,low:req, \
+ ctr:req,enc_ctr:req,one:req,out_adr:req, \
+ tmp0:req,tmp1:req,left_count:req
+ ldr q\hashkey0,[\hashkey_adr],16
+ add v\ctr\().4s,v\ctr\().4s,v\one\().4s //increase ctr
+ rev32 v\enc_ctr\().16b,v\ctr\().16b
+ aes_encrypt_round \enc_ctr,Key0
+ ldr q\hashkey0_ext,[\hashkey_adr],16
+ aes_encrypt_round \enc_ctr,Key1
+ pmull2 v\high\().1q,v\aadhash\().2d,v\hashkey0\().2d
+ pmull v\low\().1q,v\aadhash\().1d,v\hashkey0\().1d
+
+ .if \left_count > 1
+ ldr q\hashkey0,[\hashkey_adr],16
+ .endif
+ aes_encrypt_round \enc_ctr,Key2
+ pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
+ eor v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b
+
+ .if \left_count > 1
+ ldr q\hashkey0_ext,[\hashkey_adr],16
+ .endif
+ aes_encrypt_round \enc_ctr,Key3
+ eor v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b
+
+ aes_encrypt_round \enc_ctr,Key4
+ ext v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8 //low
+ ext v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8 //high
+ aes_encrypt_round \enc_ctr,Key5
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ aes_encrypt_round \enc_ctr,Key6
+ ldr q\aadhash,[\dat_adr],16
+ aes_encrypt_round \enc_ctr,Key7
+ aes_encrypt_round \enc_ctr,Key8
+ aese v\enc_ctr\().16b,vKey9.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,vKey10.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b
+ .ifc \is_enc , encrypt
+ rbit v\aadhash\().16b,v\enc_ctr\().16b
+ .endif
+ .ifc \is_enc , decrypt
+ rbit v\aadhash\().16b,v\aadhash\().16b
+ .endif
+ str q\enc_ctr,[\out_adr],16
+.endm
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_256.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_256.S
new file mode 100644
index 000000000..fb6a6e94d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_common_256.S
@@ -0,0 +1,181 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define KEY_LEN 256
+#include "gcm_common.S"
+ declare_var_vector_reg Key11,27
+ declare_var_vector_reg Key12,28
+ declare_var_vector_reg Key13,29
+ declare_var_vector_reg Key14,30
+#define KEY_REGS 0,1,2,3,4,5,6,7,8,9,10,11,12
+.macro aes_encrypt_block block:req
+ aes_encrypt_round \block,Key0
+ aes_encrypt_round \block,Key1
+ aes_encrypt_round \block,Key2
+ aes_encrypt_round \block,Key3
+ aes_encrypt_round \block,Key4
+ aes_encrypt_round \block,Key5
+ aes_encrypt_round \block,Key6
+ aes_encrypt_round \block,Key7
+ aes_encrypt_round \block,Key8
+ aes_encrypt_round \block,Key9
+ aes_encrypt_round \block,Key10
+ aes_encrypt_round \block,Key11
+ aes_encrypt_round \block,Key12
+ aese v\block\().16b,vKey13.16b
+ eor v\block\().16b,v\block\().16b,vKey14.16b
+.endm
+
+/*
+ Load Aes Keys to [vKey0..vKey8,vKeyLast0,vKeyLast1]
+	Load the AES-256 round keys into vKey0..vKey14
+.macro load_aes_keys key_addr:req
+ ld1 { vKey0.4s- vKey3.4s},[\key_addr],64
+ ld1 { vKey4.4s- vKey7.4s},[\key_addr],64
+ ld1 { vKey8.4s- vKey11.4s},[\key_addr],64
+ ld1 {vKey12.4s- vKey14.4s},[\key_addr],48
+.endm
+
+
+
+/*
+	[high,low] += aadhash * [hashkey0,hashkey0_ext]
+	dat = *dat_adr; dat_adr += 16
+	ctr += 1; enc_dat = aes_encrypt(ctr) ^ dat
+	*out_adr = enc_dat; out_adr += 16
+	aadhash = rbit(enc_dat) when encrypting, rbit(dat) when decrypting
+	if left_count > 1: [hashkey0,hashkey0_ext] = *hashkey_adr; hashkey_adr += 32
+*/
+.macro aes_gcm_middle is_enc:req,aadhash:req,dat_adr:req,hashkey_adr:req, \
+ hashkey0:req,hashkey0_ext:req,high:req,low:req, \
+ ctr:req,enc_ctr:req,one:req,out_adr:req, \
+ tmp0:req,tmp1:req,left_count:req
+
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0\().2d
+ pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0\().1d
+ .if \left_count > 1
+ ldr q\hashkey0,[\hashkey_adr],16
+ .endif
+
+ add v\ctr\().4s,v\ctr\().4s,v\one\().4s //increase ctr
+
+ rev32 v\enc_ctr\().16b,v\ctr\().16b
+ aes_encrypt_round \enc_ctr,Key0
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ pmull v\tmp0\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ pmull2 v\tmp1\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
+ .if \left_count > 1
+ ldr q\hashkey0_ext,[\hashkey_adr],16
+ .endif
+ eor v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b
+ aes_encrypt_round \enc_ctr,Key1
+ aes_encrypt_round \enc_ctr,Key2
+ eor v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b
+ aes_encrypt_round \enc_ctr,Key3
+ ext v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8
+ ext v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8
+ aes_encrypt_round \enc_ctr,Key4
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ aes_encrypt_round \enc_ctr,Key5
+ ldr q\aadhash,[\dat_adr],16
+ aes_encrypt_round \enc_ctr,Key6
+ aes_encrypt_round \enc_ctr,Key7
+ aes_encrypt_round \enc_ctr,Key8
+ aes_encrypt_round \enc_ctr,Key9
+ aes_encrypt_round \enc_ctr,Key10
+ aes_encrypt_round \enc_ctr,Key11
+ aes_encrypt_round \enc_ctr,Key12
+ aese v\enc_ctr\().16b,vKey13.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,vKey14.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b
+ .ifc \is_enc , encrypt
+ rbit v\aadhash\().16b,v\enc_ctr\().16b
+ .endif
+ .ifc \is_enc , decrypt
+ rbit v\aadhash\().16b,v\aadhash\().16b
+ .endif
+ str q\enc_ctr,[\out_adr],16
+.endm
+
+.macro aes_gcm_init is_enc:req,aadhash:req,dat_adr:req,hashkey_adr:req, \
+ hashkey0:req,hashkey0_ext:req, high:req,low:req, \
+ ctr:req,enc_ctr:req,one:req,out_adr:req, \
+ tmp0:req,tmp1:req,left_count:req
+ ldr q\hashkey0,[\hashkey_adr],16
+ add v\ctr\().4s,v\ctr\().4s,v\one\().4s /*increase ctr */
+ rev32 v\enc_ctr\().16b,v\ctr\().16b
+ aes_encrypt_round \enc_ctr,Key0
+ ldr q\hashkey0_ext,[\hashkey_adr],16
+ aes_encrypt_round \enc_ctr,Key1
+ pmull2 v\high\().1q,v\aadhash\().2d,v\hashkey0\().2d
+ pmull v\low\().1q,v\aadhash\().1d,v\hashkey0\().1d
+
+ .if \left_count > 1
+ ldr q\hashkey0,[\hashkey_adr],16
+ .endif
+ aes_encrypt_round \enc_ctr,Key2
+ pmull v\tmp1\().1q,v\aadhash\().1d,v\hashkey0_ext\().1d
+ pmull2 v\tmp0\().1q,v\aadhash\().2d,v\hashkey0_ext\().2d
+ eor v\aadhash\().16b,v\aadhash\().16b,v\aadhash\().16b
+
+ .if \left_count > 1
+ ldr q\hashkey0_ext,[\hashkey_adr],16
+ .endif
+ aes_encrypt_round \enc_ctr,Key3
+ eor v\tmp0\().16b,v\tmp1\().16b,v\tmp0\().16b
+
+ aes_encrypt_round \enc_ctr,Key4
+ ext v\tmp1\().16b,v\aadhash\().16b,v\tmp0\().16b,8 /*low */
+ ext v\tmp0\().16b,v\tmp0\().16b,v\aadhash\().16b,8 /* high */
+ aes_encrypt_round \enc_ctr,Key5
+ eor v\low\().16b,v\low\().16b,v\tmp1\().16b
+ eor v\high\().16b,v\high\().16b,v\tmp0\().16b
+ aes_encrypt_round \enc_ctr,Key6
+ ldr q\aadhash,[\dat_adr],16
+ aes_encrypt_round \enc_ctr,Key7
+ aes_encrypt_round \enc_ctr,Key8
+ aes_encrypt_round \enc_ctr,Key9
+ aes_encrypt_round \enc_ctr,Key10
+ aes_encrypt_round \enc_ctr,Key11
+ aes_encrypt_round \enc_ctr,Key12
+ aese v\enc_ctr\().16b,vKey13.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,vKey14.16b
+ eor v\enc_ctr\().16b,v\enc_ctr\().16b,v\aadhash\().16b
+ .ifc \is_enc , encrypt
+ rbit v\aadhash\().16b,v\enc_ctr\().16b
+ .endif
+ .ifc \is_enc , decrypt
+ rbit v\aadhash\().16b,v\aadhash\().16b
+ .endif
+ str q\enc_ctr,[\out_adr],16
+.endm
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S
new file mode 100644
index 000000000..927179cfc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_enc_dec.S
@@ -0,0 +1,588 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+/*
+void gist_aes_gcm_dec_##mode( \
+ const struct gcm_key_data *key_data, \
+ struct gcm_context_data *context, \
+ uint8_t *out, \
+ uint8_t const *in, \
+ uint64_t len, \
+ uint8_t *iv, \
+ \
+ uint8_t const *aad, \
+ uint64_t aad_len, \
+ uint8_t *auth_tag, \
+ uint64_t auth_tag_len \
+ \
+ )
+ */
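A hedged usage sketch for the one-shot entry points documented above (assumes isa-l_crypto's public aes_gcm.h header and its aes_gcm_pre_128() key-expansion helper; buffer sizes are illustrative):

    #include <stdint.h>
    #include "aes_gcm.h"

    static void encrypt_example(void)
    {
            uint8_t key[16] = {0}, iv[12] = {0}, aad[16] = {0};
            uint8_t pt[64] = {0}, ct[64], tag[16];
            struct gcm_key_data kd;
            struct gcm_context_data ctx;

            aes_gcm_pre_128(key, &kd);      /* expand key and precompute hash-key table */
            aes_gcm_enc_128(&kd, &ctx, ct, pt, sizeof(pt), iv,
                            aad, sizeof(aad), tag, sizeof(tag));
    }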
+
+ declare_var_generic_reg key_data ,0
+ declare_var_generic_reg context ,1
+ declare_var_generic_reg out ,2
+ declare_var_generic_reg in ,3
+ declare_var_generic_reg len ,4
+ declare_var_generic_reg iv ,5
+ declare_var_generic_reg aad ,6
+ declare_var_generic_reg aad_len ,7
+
+ declare_var_generic_reg hashkey_base,0
+ declare_var_generic_reg hashkey_addr,5
+ declare_var_generic_reg left_len ,12
+ declare_var_generic_reg aad_left ,13
+ declare_var_generic_reg temp0 ,14
+ declare_var_generic_reg temp1 ,15
+
+ declare_var_generic_reg auth_tag ,0 /* input param */
+ declare_var_generic_reg auth_tag_len,1 /* input param */
+
+
+ declare_var_vector_reg Ctr,0
+ declare_var_vector_reg AadHash,1
+ declare_var_vector_reg HashKey0,2
+ declare_var_vector_reg HashKey0Ext,3
+ declare_var_vector_reg High,4
+ declare_var_vector_reg Low,5
+ declare_var_vector_reg EncCtr,6
+ declare_var_vector_reg Dat0,6
+ declare_var_vector_reg Middle0,7
+
+ declare_var_vector_reg Tmp0,8
+ declare_var_vector_reg Tmp1,9
+ declare_var_vector_reg Zero,10
+ declare_var_vector_reg Poly,11
+ declare_var_vector_reg LeftDat ,12
+ declare_var_vector_reg Len ,13
+ declare_var_vector_reg Tmp2,14
+ declare_var_vector_reg Tmp3,15
+
+ declare_var_vector_reg One,31
+ .set stack_size,64
+ .macro push_stack
+ stp d8, d9,[sp,-stack_size]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+
+ .endm
+
+ .macro pop_stack
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], stack_size
+ .endm
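Note that only the d (low 64-bit) views are spilled: AAPCS64 requires callees to preserve just the bottom 64 bits of v8-v15, so saving d8-d15 is sufficient here.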
+
+START_FUNC(enc,KEY_LEN,_)
+START_FUNC(enc,KEY_LEN,_nt_)
+ push_stack
+	/* save in_length and aad_length */
+ stp aad_len,len,[context,AAD_LEN_OFF]
+ load_aes_keys key_data
+ /* Init Consts and IV */
+ mov wtemp1,1
+ eor vOne.16b,vOne.16b,vOne.16b
+ ld1 {vCtr.d}[0],[iv],8
+ eor vZero.16b,vZero.16b,vZero.16b
+ ld1 {vCtr.s}[2],[iv]
+ mov temp0,0x87
+ rev32 vCtr.16b,vCtr.16b /* to cpu order */
+ ins vOne.s[3],wtemp1
+ mov vAadHash.16b,vZero.16b
+ dup vPoly.2d,temp0
+ ins vCtr.s[3],wtemp1 /* Initial Ctr and Orig IV */
+
+
+ and left_len,aad_len,0xf
+ cbz aad_len,24f
+ lsr aad_len,aad_len,4
+ /* Read small data */
+ cbz left_len,2f /* aad_len >= 16,skip */
+ add aad_left,aad,aad_len,lsl 4
+ read_small_data_start LeftDat,aad_left,left_len,temp0,Tmp0
+ cbnz left_len,1f /* aad_len & 0xf != 0 */
+2:
+ cbz aad_len,1f /* aad_len <16 skip*/
+ /* left_len == 0 && aad_len !=0 */
+ sub aad_len,aad_len,1
+ /* leftDat = aad[-1] */
+ ldr qLeftDat,[aad,aad_len,lsl 4]
+1:
+ cbnz aad_len,1f /* aad_len >16,skip */
+ rbit vAadHash.16b,vLeftDat.16b
+	b	24f	/* aad_len <= 16, skip aadhash calculation */
+1:
+ /* aad_len > 16 */
+ ldr qAadHash,[aad],16
+ rbit vAadHash.16b,vAadHash.16b
+ sub aad_len,aad_len,1
+
+1:
+ /* loop ghash_block */
+ cmp aad_len,HASHKEY_TOTAL_NUM - 1
+ bls 1f // break loop
+ sub aad_len,aad_len,HASHKEY_TOTAL_NUM
+ ghash_block_n HASHKEY_TOTAL_NUM,AadHash,Dat0,aad,hashkey_addr,hashkey_base, \
+ HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \
+ Tmp0,Tmp1
+ b 1b /* back to loop start */
+1:
+ cbnz aad_len,1f /* left aad_len >32,skip */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32]
+ ghash_block_reg AadHash,LeftDat, \
+ HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \
+ Tmp0
+ b 24f /* left aad_len <=32,skip below check */
+1:
+ mov temp0,HASHKEY_TOTAL_NUM - 1
+ sub temp0,temp0,aad_len
+ add hashkey_addr,hashkey_base,temp0,lsl 5
+
+ ghash_mult_init_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Middle0,Tmp0,Dat0,2 /* load next hash */
+ sub aad_len,aad_len,1
+
+1:
+ cbz aad_len,1f
+ ghash_mult_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Middle0,Tmp0,Tmp1,Dat0, 2
+
+ sub aad_len,aad_len,1
+ b 1b
+1:
+ ghash_mult_round_noload AadHash,HashKey0,HashKey0Ext,High,Low,Middle0,Tmp0,Tmp1
+ rbit vAadHash.16b, vLeftDat.16b
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+24:
+
+ /* Enc/Dec loop */
+ and left_len,len,15
+ cbz len,24f
+ lsr len,len,4
+1:
+ /* loop aes gcm enc/dec loop */
+ cmp len,HASHKEY_TOTAL_NUM - 1
+ bls 1f // break loop
+ sub len,len,HASHKEY_TOTAL_NUM
+ aes_gcm_n_round encrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \
+ HashKey0,HashKey0Ext,High,Low,Poly, \
+ Ctr,EncCtr,One,out,Tmp0,Tmp1
+ b 1b /* back to loop start */
+1:
+ cbz len,24f /* left len == 0 */
+ mov temp0,HASHKEY_TOTAL_NUM
+ sub temp0,temp0,len
+ add hashkey_addr,hashkey_base,temp0,lsl 5
+
+ sub len,len,1
+ aes_gcm_init encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */
+ cbz len,2f
+ sub len,len,1
+1:
+
+ cbz len,1f
+ aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */
+ sub len,len,1
+ b 1b
+1:
+ aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* load next hash */
+2:
+ poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly
+24:
+ /* complete part */
+ cmp left_len,0
+ movi vHigh.16b,0
+ mov temp0,HASHKEY_TOTAL_NUM-3
+ movi vLow.16b,0
+ cinc hashkey_addr,temp0,eq
+ movi vMiddle0.16b,0
+ add hashkey_addr,hashkey_base,hashkey_addr,lsl 5
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr],32
+ beq 2f
+ read_small_data_start LeftDat,in,left_len,temp0,Tmp0
+ add vCtr.4s,vCtr.4s,vOne.4s
+ rev32 vEncCtr.16b,vCtr.16b
+ aes_encrypt_round EncCtr,Key0
+ pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round EncCtr,Key1
+ pmull vLow.1q ,vAadHash.1d,vHashKey0.1d
+ aes_encrypt_round EncCtr,Key2
+ ldr qHashKey0,[hashkey_addr],16
+ aes_encrypt_round EncCtr,Key3
+ pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d
+ aes_encrypt_round EncCtr,Key4
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ aes_encrypt_round EncCtr,Key5
+ ldr qHashKey0Ext,[hashkey_addr],16
+ aes_encrypt_round EncCtr,Key6
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ aes_encrypt_round EncCtr,Key7
+ aes_encrypt_round EncCtr,Key8
+#if KEY_LEN==256
+ aes_encrypt_round EncCtr,Key9
+ aes_encrypt_round EncCtr,Key10
+ aes_encrypt_round EncCtr,Key11
+ aes_encrypt_round EncCtr,Key12
+ aese vEncCtr.16b,vKey13.16b
+ eor vEncCtr.16b,vEncCtr.16b,vKey14.16b
+#else
+ aese vEncCtr.16b,vKey9.16b
+ eor vEncCtr.16b,vEncCtr.16b,vKey10.16b
+#endif
+ eor vEncCtr.16b,vEncCtr.16b,vLeftDat.16b
+ write_small_data_start EncCtr,out,left_len,temp0,Tmp0
+ clear_small_data EncCtr,Zero,left_len,temp0,Tmp0
+ rbit vAadHash.16b,vEncCtr.16b
+2:
+
+ ldr qLen,[context,AAD_LEN_OFF] /* Len */
+ mov wtemp0,1 /* Ek */
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d /* auth_dat * HashKey[Total-2] */
+ shl vLen.2d,vLen.2d,3 /* Len */
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* auth_dat * HashKey[Total-2] */
+ rev64 vLen.16b,vLen.16b /* Len */
+ ins vCtr.4s[3],wtemp0 /* Ek */
+ ldr qHashKey0,[hashkey_addr],16 /* auth_dat * HashKey[Total-2] */
+ pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* auth_dat * HashKey[Total-2] */
+ rev32 vEncCtr.16b,vCtr.16b /* Ek */
+ eor vHigh.16b,vHigh.16b,vTmp0.16b /* auth_dat * HashKey[Total-2] */
+ pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* auth_dat * HashKey[Total-2] */
+ rbit vAadHash.16b,vLen.16b /* Len */
+
+ aes_encrypt_round EncCtr,Key0 /* Ek */
+ eor vLow.16b,vLow.16b,vTmp1.16b /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key1 /* Ek */
+ ldr qHashKey0Ext,[hashkey_addr],16 /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key2 /* Ek */
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key3 /* Ek */
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key4 /* Ek */
+
+ pmull2 vTmp0.1q,vAadHash.2d,vHashKey0.2d /* Len * HashKey[Total-1] */
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* Len * HashKey[Total-1] */
+ aes_encrypt_round EncCtr,Key5 /* Ek */
+ aes_encrypt_round EncCtr,Key6 /* Ek */
+ pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* Len * HashKey[Total-1] */
+ aes_encrypt_round EncCtr,Key7 /* Ek */
+ eor vHigh.16b,vHigh.16b,vTmp0.16b /* Len * HashKey[Total-1] */
+ pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* Len * HashKey[Total-1] */
+ aes_encrypt_round EncCtr,Key8 /* Ek */
+ eor vLow.16b,vLow.16b,vTmp1.16b /* Len * HashKey[Total-1] */
+#if KEY_LEN==256
+ aes_encrypt_round EncCtr,Key9 /* Ek */
+ aes_encrypt_round EncCtr,Key10 /* Ek */
+ aes_encrypt_round EncCtr,Key11 /* Ek */
+ aes_encrypt_round EncCtr,Key12 /* Ek */
+ aese vEncCtr.16b,vKey13.16b /* Ek */
+ eor vEncCtr.16b,vEncCtr.16b,vKey14.16b /* Ek */
+#else
+ aese vEncCtr.16b,vKey9.16b /* Ek */
+ eor vEncCtr.16b,vEncCtr.16b,vKey10.16b /* Ek */
+#endif
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* Len * HashKey[Total-1] */
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* Len * HashKey[Total-1] */
+ rbit vAadHash.16b,vEncCtr.16b /* Aad */
+
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+ ldp auth_tag,auth_tag_len,[sp,stack_size] /* Adjust here : TODO TBD */
+ rbit vAadHash.16b,vAadHash.16b /* Aad */
+
+
+ /* output auth_tag */
+ cmp auth_tag_len,16
+ bne 1f
+ /* most likely auth_tag_len=16 */
+ str qAadHash,[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=12 */
+ cmp auth_tag_len,12
+ bne 1f
+ str dAadHash,[auth_tag],8
+ st1 {vAadHash.s}[2],[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=8 */
+ str dAadHash,[auth_tag]
+ pop_stack
+ ret
+END_FUNC(enc,KEY_LEN,_)
+END_FUNC(enc,KEY_LEN,_nt_)
+
+
+START_FUNC(dec,KEY_LEN,_)
+START_FUNC(dec,KEY_LEN,_nt_)
+ push_stack
+ /* save in_length and aad_length */
+ stp aad_len,len,[context,AAD_LEN_OFF]
+ load_aes_keys key_data
+ /* Init Consts and IV */
+ mov wtemp1,1
+ eor vOne.16b,vOne.16b,vOne.16b
+ ld1 {vCtr.d}[0],[iv],8
+ eor vZero.16b,vZero.16b,vZero.16b
+ ld1 {vCtr.s}[2],[iv]
+ mov temp0,0x87
+ rev32 vCtr.16b,vCtr.16b /* to cpu order */
+ mov vAadHash.16b,vZero.16b
+ ins vOne.s[3],wtemp1
+ dup vPoly.2d,temp0
+ ins vCtr.s[3],wtemp1 /* Initial Ctr and Orig IV */
+
+ ldp qHashKey0,qHashKey0Ext,[hashkey_base]
+ and left_len,aad_len,0xf
+ cbz aad_len,24f
+ lsr aad_len,aad_len,4
+ /* Read small data */
+ cbz left_len,2f /* aad_len >= 16,skip */
+ add aad_left,aad,aad_len,lsl 4
+ read_small_data_start LeftDat,aad_left,left_len,temp0,Tmp0
+ cbnz left_len,1f /* aad_len & 0xf != 0 */
+2:
+ cbz aad_len,1f /* aad_len <16 skip */
+ /* left_len == 0 && aad_len !=0 */
+ sub aad_len,aad_len,1
+ /* leftDat = aad[-1] */
+ ldr qLeftDat,[aad,aad_len,lsl 4]
+1:
+ cbnz aad_len,1f /* aad_len >16,skip */
+ rbit vAadHash.16b,vLeftDat.16b
+	b	24f	/* aad_len <= 16, skip aadhash calculation */
+1:
+ /* aad_len > 16 */
+ ldr qAadHash,[aad],16
+ rbit vAadHash.16b,vAadHash.16b
+ sub aad_len,aad_len,1
+
+1:
+ /** loop ghash_block */
+ cmp aad_len,HASHKEY_TOTAL_NUM - 1
+ bls 1f /* break loop */
+ sub aad_len,aad_len,HASHKEY_TOTAL_NUM
+ ghash_block_n HASHKEY_TOTAL_NUM,AadHash,Dat0,aad,hashkey_addr,hashkey_base, \
+ HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \
+ Tmp0,Tmp1
+ b 1b /* back to loop start */
+1:
+ cbnz aad_len,1f /* left aad_len >32,skip */
+ ldp qHashKey0,qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32]
+ ghash_block_reg AadHash,LeftDat, \
+ HashKey0,HashKey0Ext,High,Low,Middle0,Zero,Poly , \
+ Tmp0
+ b 24f /* left aad_len <=32,skip below check */
+1:
+ mov temp0,HASHKEY_TOTAL_NUM - 1
+ sub temp0,temp0,aad_len
+ add hashkey_addr,hashkey_base,temp0,lsl 5
+
+ ghash_mult_init_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Middle0,Tmp0,Dat0,2 /* load next hash */
+ sub aad_len,aad_len,1
+
+1:
+ cbz aad_len,1f
+ ghash_mult_round AadHash,aad,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Middle0,Tmp0,Tmp1,Dat0, 2
+
+ sub aad_len,aad_len,1
+ b 1b
+1:
+ ghash_mult_round_noload AadHash,HashKey0,HashKey0Ext,High,Low,Middle0,Tmp0,Tmp1
+ rbit vAadHash.16b, vLeftDat.16b
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+24:
+
+
+ /* Enc/Dec loop */
+ and left_len,len,15
+ cbz len,24f
+ lsr len,len,4
+1:
+ /* loop aes gcm enc/dec loop */
+ cmp len,HASHKEY_TOTAL_NUM - 1
+ bls 1f // break loop
+ sub len,len,HASHKEY_TOTAL_NUM
+ aes_gcm_n_round decrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \
+ HashKey0,HashKey0Ext,High,Low,Poly, \
+ Ctr,EncCtr,One,out,Tmp0,Tmp1
+ b 1b /* back to loop start */
+1:
+ cbz len,24f /* left len == 0 */
+ mov temp0,HASHKEY_TOTAL_NUM
+ sub temp0,temp0,len
+ add hashkey_addr,hashkey_base,temp0,lsl 5
+
+ sub len,len,1
+ aes_gcm_init decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */
+ cbz len,2f
+ sub len,len,1
+1:
+
+ cbz len,1f
+ aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */
+ sub len,len,1
+ b 1b
+1:
+ aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* load next hash */
+2:
+ poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly
+24:
+ /* complete part */
+ cmp left_len,0
+ movi vHigh.16b,0
+	mov	temp0,HASHKEY_TOTAL_NUM-3
+ movi vLow.16b,0
+ cinc hashkey_addr,temp0,eq
+ movi vMiddle0.16b,0
+ add hashkey_addr,hashkey_base,hashkey_addr,lsl 5
+ ldp qHashKey0,qHashKey0Ext,[hashkey_addr],32
+ beq 2f
+ read_small_data_start LeftDat,in,left_len,temp0,Tmp0
+ add vCtr.4s,vCtr.4s,vOne.4s
+ rev32 vEncCtr.16b,vCtr.16b
+ aes_encrypt_round EncCtr,Key0
+ pmull2 vHigh.1q,vAadHash.2d,vHashKey0.2d
+ aes_encrypt_round EncCtr,Key1
+ pmull vLow.1q ,vAadHash.1d,vHashKey0.1d
+ aes_encrypt_round EncCtr,Key2
+ ldr qHashKey0,[hashkey_addr],16
+ aes_encrypt_round EncCtr,Key3
+ pmull vMiddle0.1q,vAadHash.1d,vHashKey0Ext.1d
+ aes_encrypt_round EncCtr,Key4
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0Ext.2d
+ aes_encrypt_round EncCtr,Key5
+ ldr qHashKey0Ext,[hashkey_addr],16
+ aes_encrypt_round EncCtr,Key6
+ eor vMiddle0.16b,vMiddle0.16b,vTmp0.16b
+ aes_encrypt_round EncCtr,Key7
+ aes_encrypt_round EncCtr,Key8
+#if KEY_LEN==256
+ aes_encrypt_round EncCtr,Key9
+ aes_encrypt_round EncCtr,Key10
+ aes_encrypt_round EncCtr,Key11
+ aes_encrypt_round EncCtr,Key12
+ aese vEncCtr.16b,vKey13.16b
+ eor vEncCtr.16b,vEncCtr.16b,vKey14.16b
+ eor vEncCtr.16b,vEncCtr.16b,vLeftDat.16b
+#endif
+#if KEY_LEN==128
+ aese vEncCtr.16b,vKey9.16b
+ eor vEncCtr.16b,vEncCtr.16b,vKey10.16b
+ eor vEncCtr.16b,vEncCtr.16b,vLeftDat.16b
+#endif
+ write_small_data_start EncCtr,out,left_len,temp0,Tmp0
+ rbit vAadHash.16b,vLeftDat.16b
+
+2:
+
+ ldr qLen,[context,AAD_LEN_OFF] /* Len */
+ mov wtemp0,1 /* Ek */
+ pmull2 vTmp0.1q ,vAadHash.2d,vHashKey0.2d /* auth_dat * HashKey[Total-2] */
+ shl vLen.2d,vLen.2d,3 /* Len */
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* auth_dat * HashKey[Total-2] */
+ rev64 vLen.16b,vLen.16b /* Len */
+ ins vCtr.4s[3],wtemp0 /* Ek */
+ ldr qHashKey0,[hashkey_addr],16 /* auth_dat * HashKey[Total-2] */
+ pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* auth_dat * HashKey[Total-2] */
+ rev32 vEncCtr.16b,vCtr.16b /* Ek */
+ eor vHigh.16b,vHigh.16b,vTmp0.16b /* auth_dat * HashKey[Total-2] */
+ pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* auth_dat * HashKey[Total-2] */
+ rbit vAadHash.16b,vLen.16b /* Len */
+
+ aes_encrypt_round EncCtr,Key0 /* Ek */
+ eor vLow.16b,vLow.16b,vTmp1.16b /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key1 /* Ek */
+ ldr qHashKey0Ext,[hashkey_addr],16 /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key2 /* Ek */
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key3 /* Ek */
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* auth_dat * HashKey[Total-2] */
+ aes_encrypt_round EncCtr,Key4 /* Ek */
+
+ pmull2 vTmp0.1q,vAadHash.2d,vHashKey0.2d /* Len * HashKey[Total-1] */
+ pmull vTmp1.1q ,vAadHash.1d,vHashKey0.1d /* Len * HashKey[Total-1] */
+ aes_encrypt_round EncCtr,Key5 /* Ek */
+ aes_encrypt_round EncCtr,Key6 /* Ek */
+ pmull vTmp2.1q,vAadHash.1d,vHashKey0Ext.1d /* Len * HashKey[Total-1] */
+ aes_encrypt_round EncCtr,Key7 /* Ek */
+ eor vHigh.16b,vHigh.16b,vTmp0.16b /* Len * HashKey[Total-1] */
+ pmull2 vTmp3.1q ,vAadHash.2d,vHashKey0Ext.2d /* Len * HashKey[Total-1] */
+ aes_encrypt_round EncCtr,Key8 /* Ek */
+ eor vLow.16b,vLow.16b,vTmp1.16b /* Len * HashKey[Total-1] */
+#if KEY_LEN==256
+ aes_encrypt_round EncCtr,Key9 /* Ek */
+ aes_encrypt_round EncCtr,Key10 /* Ek */
+ aes_encrypt_round EncCtr,Key11 /* Ek */
+ aes_encrypt_round EncCtr,Key12 /* Ek */
+ aese vEncCtr.16b,vKey13.16b /* Ek */
+ eor vEncCtr.16b,vEncCtr.16b,vKey14.16b /* Ek */
+#else
+ aese vEncCtr.16b,vKey9.16b /* Ek */
+ eor vEncCtr.16b,vEncCtr.16b,vKey10.16b /* Ek */
+#endif
+ eor vMiddle0.16b,vMiddle0.16b,vTmp2.16b /* Len * HashKey[Total-1] */
+ eor vMiddle0.16b,vMiddle0.16b,vTmp3.16b /* Len * HashKey[Total-1] */
+ rbit vAadHash.16b,vEncCtr.16b /* Aad */
+
+ ghash_mult_final_round AadHash,High,Low,Middle0,Tmp0,Zero,Poly
+
+ ldp auth_tag,auth_tag_len,[sp,stack_size] /* Adjust here : TODO TBD */
+ rbit vAadHash.16b,vAadHash.16b /* Aad */
+
+
+ /* output auth_tag */
+ cmp auth_tag_len,16
+ bne 1f
+ /* most likely auth_tag_len=16 */
+ str qAadHash,[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=12 */
+ cmp auth_tag_len,12
+ bne 1f
+ str dAadHash,[auth_tag],8
+ st1 {vAadHash.s}[2],[auth_tag]
+ pop_stack
+ ret
+1: /* auth_tag_len=8 */
+ str dAadHash,[auth_tag]
+ pop_stack
+ ret
+END_FUNC(dec,KEY_LEN,_)
+END_FUNC(dec,KEY_LEN,_nt_)
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_multibinary_aarch64.S
new file mode 100644
index 000000000..b5433a1df
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_multibinary_aarch64.S
@@ -0,0 +1,58 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aarch64_multibinary.h"
+
+mbin_interface aes_gcm_enc_128
+mbin_interface aes_gcm_dec_128
+mbin_interface aes_gcm_precomp_128
+mbin_interface aes_gcm_enc_256
+mbin_interface aes_gcm_dec_256
+mbin_interface aes_gcm_precomp_256
+
+
+mbin_interface aes_gcm_enc_128_update
+mbin_interface aes_gcm_enc_128_finalize
+mbin_interface aes_gcm_dec_128_update
+mbin_interface aes_gcm_dec_128_finalize
+mbin_interface aes_gcm_enc_256_update
+mbin_interface aes_gcm_enc_256_finalize
+mbin_interface aes_gcm_dec_256_update
+mbin_interface aes_gcm_dec_256_finalize
+
+mbin_interface aes_gcm_init_256
+mbin_interface aes_gcm_init_128
+mbin_interface aes_gcm_enc_128_nt
+mbin_interface aes_gcm_enc_128_update_nt
+mbin_interface aes_gcm_dec_128_nt
+mbin_interface aes_gcm_dec_128_update_nt
+mbin_interface aes_gcm_enc_256_nt
+mbin_interface aes_gcm_enc_256_update_nt
+mbin_interface aes_gcm_dec_256_nt
+mbin_interface aes_gcm_dec_256_update_nt
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S
new file mode 100644
index 000000000..e555c9798
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_precomp.S
@@ -0,0 +1,83 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+/*
+void aes_gcm_precomp(struct gcm_key_data *key_data);
+*/
+ declare_var_generic_reg key_data ,0
+ declare_var_generic_reg temp0 ,1
+ declare_var_generic_reg hashkey_base,0
+ declare_var_generic_reg hashkey_addr,1
+
+ declare_var_vector_reg Low ,0
+ declare_var_vector_reg Middle0 ,1
+ declare_var_vector_reg Middle1 ,2
+ declare_var_vector_reg High ,3
+ declare_var_vector_reg HashKeyIter ,4
+ declare_var_vector_reg HashKey ,5
+ declare_var_vector_reg HashKeyExt ,6
+ declare_var_vector_reg Poly ,7
+ declare_var_vector_reg Zero ,31
+
+START_FUNC(precomp,KEY_LEN,_)
+ load_aes_keys key_data
+ mov temp0,0x87
+ eor vZero.16b,vZero.16b,vZero.16b
+ eor vHashKey.16b,vHashKey.16b,vHashKey.16b
+ dup vPoly.2d,temp0
+ aes_encrypt_block HashKey
+ add hashkey_addr,hashkey_base,(HASHKEY_TOTAL_NUM-1)*32
+ rbit vHashKey.16b,vHashKey.16b
+ ext vHashKeyExt.16b,vHashKey.16b,vHashKey.16b,8
+ mov vHashKeyIter.16b,vHashKey.16b
+ stp qHashKey,qHashKeyExt,[hashkey_addr],-32
+
+1:
+ pmull vMiddle0.1q,vHashKeyIter.1d,vHashKeyExt.1d
+ pmull2 vMiddle1.1q,vHashKeyIter.2d,vHashKeyExt.2d
+ pmull vLow.1q ,vHashKeyIter.1d,vHashKey.1d
+ eor vMiddle0.16b,vMiddle0.16b,vMiddle1.16b
+ pmull2 vHigh.1q ,vHashKeyIter.2d,vHashKey.2d
+ ext vMiddle1.16b,vMiddle0.16b,vZero.16b,8 //high
+ ext vMiddle0.16b,vZero.16b,vMiddle0.16b,8 //low
+ eor vHigh.16b ,vHigh.16b,vMiddle1.16b
+ eor vLow.16b ,vLow.16b ,vMiddle0.16b
+ pmull2 vMiddle0.1q ,vHigh.2d ,vPoly.2d
+ ext vMiddle1.16b,vMiddle0.16b,vZero.16b,8 //high
+ ext vMiddle0.16b,vZero.16b,vMiddle0.16b,8 //low
+ eor vHigh.16b ,vHigh.16b,vMiddle1.16b
+ eor vLow.16b ,vLow.16b ,vMiddle0.16b
+ pmull vMiddle0.1q ,vHigh.1d ,vPoly.1d
+ eor vHashKeyIter.16b,vLow.16b,vMiddle0.16b
+ ext vLow.16b,vHashKeyIter.16b,vHashKeyIter.16b,8
+ stp qHashKeyIter,qLow,[hashkey_addr],-32
+ cmp hashkey_addr,hashkey_base
+ bcs 1b
+
+ ret
+END_FUNC(precomp,KEY_LEN,_)
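The precompute routine above derives H by encrypting the all-zero block with the loaded AES key schedule, bit-reflects it, and then fills the GHASH hash-key table from the highest offset downward, storing each successive power of H together with a halves-swapped copy for the pmull/pmull2 pairs. A minimal C sketch of the caller-visible setup step, assuming the public wrapper name and signature from isa-l_crypto's aes_gcm.h (aes_gcm_pre_128 is understood to run key expansion plus this precompute; treat the exact prototype as an assumption, not a restatement of this patch):

    #include "aes_gcm.h"     /* isa-l_crypto public header (assumed available) */

    /* One-time setup per key: expand the AES key schedule and build the
     * GHASH hash-key table that aes_gcm_precomp_{128,256} fills above. */
    static void gcm_setup_128(const uint8_t key[16], struct gcm_key_data *kd)
    {
            aes_gcm_pre_128(key, kd);   /* assumed wrapper: keyexp + precomp */
    }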
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_update.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_update.S
new file mode 100644
index 000000000..d47c52212
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/gcm_update.S
@@ -0,0 +1,277 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+/*
+void gist_aes_gcm_dec_update_##mode( \
+ const struct gcm_key_data *key_data, \
+ struct gcm_context_data *context, \
+ uint8_t *out, \
+ const uint8_t *in, \
+ uint64_t len \
+ )
+ */
+
+ declare_var_generic_reg key_data ,0
+ declare_var_generic_reg context ,1
+ declare_var_generic_reg out ,2
+ declare_var_generic_reg in ,3
+ declare_var_generic_reg len ,4
+ declare_var_generic_reg partial_block_length,5
+ declare_var_generic_reg blocks ,5
+ declare_var_generic_reg hashkey_base,0
+ declare_var_generic_reg hashkey_addr,6
+ declare_var_generic_reg temp0 ,14
+ declare_var_generic_reg temp1 ,15
+ declare_var_generic_reg temp2 ,13
+
+
+
+ declare_var_vector_reg Ctr,0
+ declare_var_vector_reg AadHash,1
+ declare_var_vector_reg HashKey0,2
+ declare_var_vector_reg HashKey0Ext,3
+ declare_var_vector_reg High,4
+ declare_var_vector_reg Low,5
+ declare_var_vector_reg EncCtr,6
+ declare_var_vector_reg Middle,7
+
+ declare_var_vector_reg Tmp0,8
+ declare_var_vector_reg Tmp1,9
+ declare_var_vector_reg Zero,10
+ declare_var_vector_reg Poly,11
+ declare_var_vector_reg PartialBlock ,12
+ declare_var_vector_reg One,31
+ .set stack_size,48
+ .macro push_stack
+ stp d8, d9, [sp,-stack_size]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+
+ .endm
+
+ .macro pop_stack
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d8, d9, [sp], stack_size
+ .endm
+/*
+ 20:exit_without_popstack
+ 21:start_of_mainloop
+ 22:exit_with_popstack
+ 23:partial_block_start
+ */
+START_FUNC(enc,KEY_LEN,_update_)
+START_FUNC(enc,KEY_LEN,_update_nt_)
+ ldr temp0,[context,IN_LENGTH_OFF] /*load in_length */
+ ldr partial_block_length,[context,PARTIAL_BLOCK_LENGTH_OFF]
+ ldr qAadHash,[context]
+ cbz len,20f /** if(len==0)return; exit_without_popstack*/
+ push_stack
+ add temp0,temp0,len /* temp0=temp0+len */
+ load_aes_keys key_data
+ str temp0,[context,IN_LENGTH_OFF] /* save in_length */
+ /* Init Consts and IV */
+ ldr qCtr,[context,CTR_OFF]
+ mov wtemp1,1
+ eor vOne.16b,vOne.16b,vOne.16b
+ mov temp0,0x87
+ eor vZero.16b,vZero.16b,vZero.16b
+ ins vOne.s[3],wtemp1
+ dup vPoly.2d,temp0
+ cbnz partial_block_length,23f /* if(partial_block_length!=0) not normal case*/
+21: /* start_of_mainloop */
+ cbz len,24f
+ lsr blocks,len,4
+ cmp blocks,HASHKEY_TOTAL_NUM - 1
+ and len,len,0xf
+	/* main aes gcm enc/dec loop */
+ bls 2f /* skip loop */
+1:
+ sub blocks,blocks,HASHKEY_TOTAL_NUM
+ cmp blocks,HASHKEY_TOTAL_NUM - 1
+ aes_gcm_n_round encrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \
+ HashKey0,HashKey0Ext,High,Low,Poly, \
+ Ctr,EncCtr,One,out,Tmp0,Tmp1
+ bhi 1b /* back to loop start */
+2:
+	cbz	blocks,4f	/* left blocks == 0 */
+ /* -(blocks - HASHKEY_TOTAL_NUM) */
+ sub temp0,blocks,HASHKEY_TOTAL_NUM
+ neg temp0,temp0
+ sub blocks,blocks,1
+ add hashkey_addr,hashkey_base,temp0,lsl 5
+
+ aes_gcm_init encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */
+ cbz blocks,3f /* origin_blocks == 1 */
+ sub blocks,blocks,1
+
+ cbz blocks,2f /* origin_blocks == 2 */
+1:
+ sub blocks,blocks,1
+ aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */
+ cbnz blocks,1b
+2:
+ aes_gcm_middle encrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* not load next hash */
+3:
+ poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly
+4:
+ str qAadHash,[context]
+ str qCtr,[context,CTR_OFF]
+ cbnz len,24f
+22: /* exit_with_popstack */
+ pop_stack
+20: /* exit_without_popstack */
+ ret
+23: /* partial_block_start */
+
+ generic_partial_block_start encrypt,len,in,out,context, \
+ temp2,partial_block_length,temp0,temp1,hashkey_addr
+ cbnz partial_block_length,22b
+ ldr qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32+16]
+ ldr qHashKey0 ,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32]
+ ldr qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF]
+ ghash_block_reg AadHash,PartialBlock,HashKey0,HashKey0Ext, \
+ High,Low,Middle,Zero,Poly,Tmp0
+ str qAadHash,[context]
+ cbz len,4b
+ cmp len,15
+ bhi 21b
+24: /* partial_block_end */
+ add vCtr.4s,vCtr.4s,vOne.4s
+ read_small_data_start PartialBlock,in,len,temp0,Tmp0
+ rev32 vEncCtr.16b,vCtr.16b
+ str qCtr,[context,CTR_OFF]
+ aes_encrypt_block EncCtr
+ eor vPartialBlock.16b,vPartialBlock.16b,vEncCtr.16b
+ str qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF]
+ write_small_data_start PartialBlock,out,len,temp0,Tmp0
+ str len,[context,PARTIAL_BLOCK_LENGTH_OFF]
+ pop_stack
+ ret
+
+END_FUNC(enc,KEY_LEN,_update_)
+END_FUNC(enc,KEY_LEN,_update_nt_)
+
+
+START_FUNC(dec,KEY_LEN,_update_)
+START_FUNC(dec,KEY_LEN,_update_nt_)
+ ldr temp0,[context,IN_LENGTH_OFF] /*load in_length */
+ ldr partial_block_length,[context,PARTIAL_BLOCK_LENGTH_OFF]
+ ldr qAadHash,[context]
+ cbz len,20f /** if(len==0)return; exit_without_popstack*/
+ push_stack
+ add temp0,temp0,len /* temp0=temp0+len */
+ load_aes_keys key_data
+ str temp0,[context,IN_LENGTH_OFF] /* save in_length */
+ /* Init Consts and IV */
+ ldr qCtr,[context,CTR_OFF]
+ mov wtemp1,1
+ eor vOne.16b,vOne.16b,vOne.16b
+ mov temp0,0x87
+ eor vZero.16b,vZero.16b,vZero.16b
+ ins vOne.s[3],wtemp1
+ dup vPoly.2d,temp0
+ cbnz partial_block_length,23f /* if(partial_block_length!=0) not normal case*/
+21: /* start_of_mainloop */
+ cbz len,24f
+ lsr blocks,len,4
+ cmp blocks,HASHKEY_TOTAL_NUM - 1
+ and len,len,0xf
+	/* main aes gcm enc/dec loop */
+ bls 2f /* skip loop */
+1:
+ sub blocks,blocks,HASHKEY_TOTAL_NUM
+ cmp blocks,HASHKEY_TOTAL_NUM - 1
+ aes_gcm_n_round decrypt,HASHKEY_TOTAL_NUM,AadHash,in,hashkey_addr,hashkey_base, \
+ HashKey0,HashKey0Ext,High,Low,Poly, \
+ Ctr,EncCtr,One,out,Tmp0,Tmp1
+ bhi 1b /* back to loop start */
+2:
+ cbz blocks,4f /* left blocks == 0 */
+ /* -(blocks - HASHKEY_TOTAL_NUM) */
+ sub temp0,blocks,HASHKEY_TOTAL_NUM
+ neg temp0,temp0
+ sub blocks,blocks,1
+ add hashkey_addr,hashkey_base,temp0,lsl 5
+
+ aes_gcm_init decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 // load next hash
+ cbz blocks,3f /* origin_blocks == 1 */
+ sub blocks,blocks,1
+
+ cbz blocks,2f /* origin_blocks == 2 */
+1:
+ sub blocks,blocks,1
+ aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,2 /* load next hash */
+ cbnz blocks,1b
+2:
+ aes_gcm_middle decrypt,AadHash,in,hashkey_addr,HashKey0,HashKey0Ext, \
+ High,Low,Ctr,EncCtr,One,out,Tmp0,Tmp1,1 /* not load next hash */
+3:
+ poly_mult_final_x2 AadHash,High,Low,Tmp0,Tmp1,Poly
+4:
+ str qAadHash,[context]
+ str qCtr,[context,CTR_OFF]
+ cbnz len,24f
+22: /* exit_with_popstack */
+ pop_stack
+20: /* exit_without_popstack */
+ ret
+23: /* partial_block_start */
+
+ generic_partial_block_start decrypt,len,in,out,context, \
+ temp2,partial_block_length,temp0,temp1,hashkey_addr
+ cbnz partial_block_length,22b
+ ldr qHashKey0Ext,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32+16]
+ ldr qHashKey0 ,[hashkey_base,(HASHKEY_TOTAL_NUM-1)*32]
+ ldr qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF]
+ ghash_block_reg AadHash,PartialBlock,HashKey0,HashKey0Ext, \
+ High,Low,Middle,Zero,Poly,Tmp0
+ str qAadHash,[context]
+ cbz len,4b
+ cmp len,15
+ bhi 21b
+24: /* partial_block_end */
+ add vCtr.4s,vCtr.4s,vOne.4s
+ read_small_data_start PartialBlock,in,len,temp0,Tmp0
+ rev32 vEncCtr.16b,vCtr.16b
+ str qCtr,[context,CTR_OFF]
+ aes_encrypt_block EncCtr
+ eor vEncCtr.16b,vPartialBlock.16b,vEncCtr.16b
+ tbx_small_data_start EncCtr,PartialBlock,len,temp0,Tmp0
+ write_small_data_start EncCtr,out,len,temp0,Tmp0
+ str qPartialBlock,[context,PARTIAL_BLOCK_ENC_KEY_OFF]
+ str len,[context,PARTIAL_BLOCK_LENGTH_OFF]
+ pop_stack
+ ret
+END_FUNC(dec,KEY_LEN,_update_)
+END_FUNC(dec,KEY_LEN,_update_nt_)
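The update entry points above are what make the streaming API chunk-size agnostic: they first drain any partial block carried in gcm_context_data (label 23), then run the unrolled HASHKEY_TOTAL_NUM-block main loop, and finally stash a trailing partial block plus its keystream back into the context (label 24). A hedged C sketch of the intended call pattern, using the update signature documented in the comment above together with an assumed aes_gcm_init_128/aes_gcm_enc_128_finalize pair from aes_gcm.h:

    #include <stdint.h>
    #include "aes_gcm.h"     /* struct gcm_key_data, struct gcm_context_data */

    /* Encrypt a message fed in arbitrary-sized chunks; the context carries
     * the counter, the running GHASH and any partial block between calls. */
    static void stream_encrypt_128(const struct gcm_key_data *kd,
                                   struct gcm_context_data *ctx,
                                   uint8_t *iv, const uint8_t *aad, uint64_t aad_len,
                                   const uint8_t *msg, uint64_t msg_len,
                                   uint8_t *out, uint8_t tag[16])
    {
            aes_gcm_init_128(kd, ctx, iv, aad, aad_len);      /* assumed signature */
            for (uint64_t off = 0; off < msg_len; ) {
                    uint64_t n = msg_len - off;
                    if (n > 4096)
                            n = 4096;           /* any chunk size works per call */
                    aes_gcm_enc_128_update(kd, ctx, out + off, msg + off, n);
                    off += n;
            }
            aes_gcm_enc_128_finalize(kd, ctx, tag, 16);       /* assumed signature */
    }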
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_128_aarch64_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_128_aarch64_aes.S
new file mode 100644
index 000000000..4a3e990c3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_128_aarch64_aes.S
@@ -0,0 +1,134 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.arch armv8-a+crypto
+
+ .text
+/*
+Macros
+*/
+#define NUM_ROUNDS(a) (7+(a)/32)
+.macro declare_var_vector_reg name:req,reg:req
+ q\name .req q\reg
+ v\name .req v\reg
+ s\name .req s\reg
+.endm
+.macro round_128 off:req,rcon:req
+ .if \off == 0
+ ldp w_tmp2,w_tmp3,[key,8]
+ ldp w_tmp0,w_tmp1,[key]
+ movi vzero.4s,0
+ dup vsrc.4s,w_tmp3
+ stp w_tmp2,w_tmp3,[exp_key_enc,8]
+ stp w_tmp0,w_tmp1,[exp_key_enc]
+ .endif
+ mov w0,\rcon
+ mov vdest.16b,vzero.16b
+ aese vdest.16b,vsrc.16b
+ mov w_tmp4,vdest.s[0]
+ eor w_tmp0,w_tmp0,w0
+ eor w_tmp0,w_tmp0,w_tmp4,ror 8
+ eor w_tmp1,w_tmp0,w_tmp1
+ eor w_tmp2,w_tmp1,w_tmp2
+ eor w_tmp3,w_tmp2,w_tmp3
+ stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*\off+KEY_LEN]
+ stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*\off+8+KEY_LEN]
+ .if \off != 10
+ dup vsrc.4s,w_tmp3
+ .endif
+.endm
+.macro export_dec_key rounds:req,enc_key:req,dec_key:req
+ ldr q0,[\enc_key]
+ ldr q1,[\enc_key,(\rounds-1)*16]
+ str q0,[\dec_key,(\rounds-1)*16]
+ str q1,[\dec_key]
+ ldp q0,q1,[\enc_key,1*16]
+ ldp q2,q3,[\enc_key,(1+2)*16]
+ ldp q4,q5,[\enc_key,(1+4)*16]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ ldp q6,q7,[\enc_key,(1+6)*16]
+ aesimc v2.16b,v2.16b
+ aesimc v3.16b,v3.16b
+ stp q1,q0,[\dec_key,(\rounds-1-2)*16]
+ aesimc v4.16b,v4.16b
+ aesimc v5.16b,v5.16b
+ stp q3,q2,[\dec_key,(\rounds-1-4)*16]
+ ldr q0,[\enc_key,(1+8)*16]
+ aesimc v6.16b,v6.16b
+ aesimc v7.16b,v7.16b
+ stp q5,q4,[\dec_key,(\rounds-1-6)*16]
+ aesimc v0.16b,v0.16b
+ stp q7,q6,[\dec_key,(\rounds-1-8)*16]
+ str q0,[\dec_key,(\rounds-1-9)*16]
+.endm
+/**
+ void aes_keyexp_128_aes(const uint8_t * key,
+ uint8_t * exp_key_enc, uint8_t * exp_key_dec)
+*/
+ key .req x0
+ exp_key_enc .req x1
+ exp_key_dec .req x2
+ .equ KEY_LEN, (128/8)
+ w_tmp0 .req w3
+ w_tmp1 .req w4
+ w_tmp2 .req w5
+ w_tmp3 .req w6
+ w_tmp4 .req w7
+ declare_var_vector_reg dest,0
+ declare_var_vector_reg zero,1
+ declare_var_vector_reg src, 2
+
+
+ .global aes_keyexp_128_aes
+ .type aes_keyexp_128_aes, %function
+
+aes_keyexp_128_aes:
+ .set rcon,1
+ .set off,0
+ .rept 10
+ round_128 off,rcon
+ .set off,off+1
+ .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b)
+ .endr
+
+ export_dec_key NUM_ROUNDS(128),exp_key_enc,exp_key_dec
+ ret
+ .size aes_keyexp_128_aes, .-aes_keyexp_128_aes
+ .global aes_keyexp_128_enc_aes
+ .type aes_keyexp_128_enc_aes, %function
+aes_keyexp_128_enc_aes:
+ .set rcon,1
+ .set off,0
+ .rept 10
+ round_128 off,rcon
+ .set off,off+1
+ .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b)
+ .endr
+ ret
+ .size aes_keyexp_128_enc_aes, .-aes_keyexp_128_enc_aes \ No newline at end of file
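The `.set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b)` step repeated above is the standard AES round-constant recurrence: doubling in GF(2^8) with reduction by the AES polynomial 0x11b, which yields 0x01, 0x02, 0x04, ..., 0x80, 0x1b, 0x36 across the ten AES-128 rounds. A small standalone C sketch of the same recurrence (illustrative only, not part of this patch):

    #include <stdio.h>

    int main(void)
    {
            unsigned rcon = 1;
            for (int i = 0; i < 10; i++) {
                    printf("round %2d: rcon = 0x%02x\n", i + 1, rcon);
                    /* xtime in GF(2^8): double, then reduce by x^8+x^4+x^3+x+1 */
                    rcon = (rcon << 1) ^ ((rcon >> 7) * 0x11b);
            }
            return 0;
    }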
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_192_aarch64_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_192_aarch64_aes.S
new file mode 100644
index 000000000..2ba46060c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_192_aarch64_aes.S
@@ -0,0 +1,136 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+/*
+Macros
+*/
+#define NUM_ROUNDS(a) (7+(a)/32)
+.macro declare_var_vector_reg name:req,reg:req
+ q\name .req q\reg
+ v\name .req v\reg
+ s\name .req s\reg
+.endm
+.macro round_192 off:req,rcon:req
+ .if \off == 0
+ ldp w_tmp0,w_tmp1,[key]
+ ldp w_tmp2,w_tmp3,[key,8]
+ ldp w_tmp4,w_tmp5,[key,16]
+ movi vzero.4s,0
+ dup vsrc.4s,w_tmp5
+ stp w_tmp0,w_tmp1,[exp_key_enc]
+ stp w_tmp4,w_tmp5,[exp_key_enc,16]
+ stp w_tmp2,w_tmp3,[exp_key_enc,8]
+ .endif
+ mov w0,\rcon
+ mov vdest.16b,vzero.16b
+ aese vdest.16b,vsrc.16b
+ mov w_tmp,vdest.s[0]
+ eor w_tmp0,w_tmp0,w0
+ eor w_tmp0,w_tmp0,w_tmp,ror 8
+ eor w_tmp1,w_tmp0,w_tmp1
+ eor w_tmp2,w_tmp1,w_tmp2
+ eor w_tmp3,w_tmp2,w_tmp3
+ .if \off < 7
+ eor w_tmp4,w_tmp4,w_tmp3
+ eor w_tmp5,w_tmp5,w_tmp4
+ dup vsrc.4s,w_tmp5
+ stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)]
+ stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8]
+ stp w_tmp4,w_tmp5,[exp_key_enc,KEY_LEN*(\off+1)+16]
+ .else
+ stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)]
+ stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8]
+ .endif
+.endm
+
+.macro export_dec_key rounds:req,enc_key:req,dec_key:req
+ ldr q0,[\enc_key]
+ ldr q1,[\enc_key,(\rounds-1)*16]
+ str q0,[\dec_key,(\rounds-1)*16]
+ str q1,[\dec_key]
+ ldp q0,q1,[\enc_key,1*16]
+ ldp q2,q3,[\enc_key,(1+2)*16]
+ ldp q4,q5,[\enc_key,(1+4)*16]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ ldp q6,q7,[\enc_key,(1+6)*16]
+ aesimc v2.16b,v2.16b
+ aesimc v3.16b,v3.16b
+ stp q1,q0,[\dec_key,(\rounds-1-2)*16]
+ ldp q0,q1,[\enc_key,(1+8)*16]
+ aesimc v4.16b,v4.16b
+ aesimc v5.16b,v5.16b
+ stp q3,q2,[\dec_key,(\rounds-1-4)*16]
+ aesimc v6.16b,v6.16b
+ aesimc v7.16b,v7.16b
+ stp q5,q4,[\dec_key,(\rounds-1-6)*16]
+ ldr q2,[\enc_key,(1+10)*16]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ stp q7,q6,[\dec_key,(\rounds-1-8)*16]
+ aesimc v2.16b,v2.16b
+ stp q1,q0,[\dec_key,(\rounds-1-10)*16]
+ str q2,[\dec_key,(\rounds-1-11)*16]
+.endm
+/**
+ void aes_keyexp_192_aes(const uint8_t * key,
+ uint8_t * exp_key_enc, uint8_t * exp_key_dec)
+*/
+ key .req x0
+ exp_key_enc .req x1
+ exp_key_dec .req x2
+ .equ KEY_LEN, (192/8)
+ w_tmp0 .req w3
+ w_tmp1 .req w4
+ w_tmp2 .req w5
+ w_tmp3 .req w6
+ w_tmp .req w7
+ w_tmp4 .req w9
+ w_tmp5 .req w10
+ declare_var_vector_reg dest,0
+ declare_var_vector_reg zero,1
+ declare_var_vector_reg src, 2
+
+
+ .global aes_keyexp_192_aes
+ .type aes_keyexp_192_aes, %function
+
+aes_keyexp_192_aes:
+ .set rcon,1
+ .set off,0
+ .rept 8
+ round_192 off,rcon
+ .set off,off+1
+ .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b)
+ .endr
+ export_dec_key NUM_ROUNDS(192),exp_key_enc,exp_key_dec
+ ret
+ .size aes_keyexp_192_aes, .-aes_keyexp_192_aes
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_256_aarch64_aes.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_256_aarch64_aes.S
new file mode 100644
index 000000000..5433b2ff6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_256_aarch64_aes.S
@@ -0,0 +1,153 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+
+ .text
+/*
+Macros
+*/
+#define NUM_ROUNDS(a) (7+(a)/32)
+.macro declare_var_vector_reg name:req,reg:req
+ q\name .req q\reg
+ v\name .req v\reg
+ s\name .req s\reg
+.endm
+.macro round_256 off:req,rcon:req,export_dec_key
+ .if \off == 0
+ ldp w_tmp6,w_tmp7,[key,24]
+ ldp w_tmp0,w_tmp1,[key]
+ ldp w_tmp2,w_tmp3,[key,8]
+ ldp w_tmp4,w_tmp5,[key,16]
+ movi vzero.4s,0
+ dup vsrc.4s,w_tmp7
+ stp w_tmp6,w_tmp7,[exp_key_enc,24]
+ stp w_tmp0,w_tmp1,[exp_key_enc]
+ stp w_tmp4,w_tmp5,[exp_key_enc,16]
+ stp w_tmp2,w_tmp3,[exp_key_enc,8]
+ .endif
+ mov w0,\rcon
+ mov vdest.16b,vzero.16b
+ aese vdest.16b,vsrc.16b
+ mov w_tmp,vdest.s[0]
+ eor w_tmp0,w_tmp0,w0
+ eor w_tmp0,w_tmp0,w_tmp,ror 8
+ eor w_tmp1,w_tmp0,w_tmp1
+ eor w_tmp2,w_tmp1,w_tmp2
+ eor w_tmp3,w_tmp2,w_tmp3
+ .if \off < 6
+ dup vsrc.4s,w_tmp3
+ mov vdest.16b,vzero.16b
+ aese vdest.16b,vsrc.16b
+ mov w_tmp,vdest.s[0]
+ eor w_tmp4,w_tmp4,w_tmp
+ eor w_tmp5,w_tmp5,w_tmp4
+ eor w_tmp6,w_tmp6,w_tmp5
+ eor w_tmp7,w_tmp7,w_tmp6
+ dup vsrc.4s,w_tmp7
+ stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)]
+ stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8]
+ stp w_tmp4,w_tmp5,[exp_key_enc,KEY_LEN*(\off+1)+16]
+ stp w_tmp6,w_tmp7,[exp_key_enc,KEY_LEN*(\off+1)+24]
+ .else
+ stp w_tmp0,w_tmp1,[exp_key_enc,KEY_LEN*(\off+1)]
+ stp w_tmp2,w_tmp3,[exp_key_enc,KEY_LEN*(\off+1)+8]
+ .endif
+.endm
+
+.macro export_dec_key rounds:req,enc_key:req,dec_key:req
+ ldr q0,[\enc_key]
+ ldr q1,[\enc_key,(\rounds-1)*16]
+ str q0,[\dec_key,(\rounds-1)*16]
+ str q1,[\dec_key]
+ ldp q0,q1,[\enc_key,1*16]
+ ldp q2,q3,[\enc_key,(1+2)*16]
+ ldp q4,q5,[\enc_key,(1+4)*16]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ ldp q6,q7,[\enc_key,(1+6)*16]
+ aesimc v2.16b,v2.16b
+ aesimc v3.16b,v3.16b
+ stp q1,q0,[\dec_key,(\rounds-1-2)*16]
+ ldp q0,q1,[\enc_key,(1+8)*16]
+ aesimc v4.16b,v4.16b
+ aesimc v5.16b,v5.16b
+ stp q3,q2,[\dec_key,(\rounds-1-4)*16]
+ ldp q2,q3,[\enc_key,(1+10)*16]
+
+ aesimc v6.16b,v6.16b
+ aesimc v7.16b,v7.16b
+ stp q5,q4,[\dec_key,(\rounds-1-6)*16]
+ ldr q4,[\enc_key,(1+12)*16]
+ aesimc v0.16b,v0.16b
+ aesimc v1.16b,v1.16b
+ stp q7,q6,[\dec_key,(\rounds-1-8)*16]
+ aesimc v2.16b,v2.16b
+ aesimc v3.16b,v3.16b
+ stp q1,q0,[\dec_key,(\rounds-1-10)*16]
+ aesimc v4.16b,v4.16b
+ stp q3,q2,[\dec_key,(\rounds-1-12)*16]
+ str q4,[\dec_key,(\rounds-1-13)*16]
+.endm
+/**
+ void aes_keyexp_256_aes(const uint8_t * key,
+ uint8_t * exp_key_enc, uint8_t * exp_key_dec)
+*/
+ key .req x0
+ exp_key_enc .req x1
+ exp_key_dec .req x2
+ .equ KEY_LEN, (256/8)
+ w_tmp0 .req w3
+ w_tmp1 .req w4
+ w_tmp2 .req w5
+ w_tmp3 .req w6
+ w_tmp .req w7
+ w_tmp4 .req w9
+ w_tmp5 .req w10
+ w_tmp6 .req w11
+ w_tmp7 .req w12
+ declare_var_vector_reg dest,0
+ declare_var_vector_reg zero,1
+ declare_var_vector_reg src, 2
+
+
+ .global aes_keyexp_256_aes
+ .type aes_keyexp_256_aes, %function
+
+aes_keyexp_256_aes:
+ .set rcon,1
+ .set off,0
+ .rept 7
+ round_256 off,rcon,1
+ .set off,off+1
+ .set rcon,(rcon << 1) ^ ((rcon >> 7) * 0x11b)
+ .endr
+ export_dec_key NUM_ROUNDS(256),exp_key_enc,exp_key_dec
+ ret
+ .size aes_keyexp_256_aes, .-aes_keyexp_256_aes
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_aarch64_dispatcher.c
new file mode 100644
index 000000000..14c9889ac
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_aarch64_dispatcher.c
@@ -0,0 +1,72 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+#undef PROVIDER_BASIC
+#define PROVIDER_BASIC(a) (void*)0
+
+DEFINE_INTERFACE_DISPATCHER(aes_keyexp_128)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES))
+ return PROVIDER_INFO(aes_keyexp_128_aes);
+
+ return PROVIDER_BASIC(aes_keyexp_128);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_keyexp_128_enc)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES))
+ return PROVIDER_INFO(aes_keyexp_128_enc_aes);
+
+ return PROVIDER_BASIC(aes_keyexp_128_enc);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_keyexp_192)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES))
+ return PROVIDER_INFO(aes_keyexp_192_aes);
+
+ return PROVIDER_BASIC(aes_keyexp_192);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(aes_keyexp_256)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if ((auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES))
+ return PROVIDER_INFO(aes_keyexp_256_aes);
+
+ return PROVIDER_BASIC(aes_keyexp_256);
+
+}
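Each dispatcher above resolves its interface once at load time: it reads the ELF auxiliary vector and selects the Crypto-Extension implementation only when both HWCAP_ASIMD and HWCAP_AES are advertised, otherwise it falls back to PROVIDER_BASIC (stubbed to NULL here). The same capability test in isolation, as a minimal standalone C sketch for Linux/aarch64 (the XTS dispatcher later in this patch factors it out the same way as is_crypto_available):

    #include <sys/auxv.h>       /* getauxval, AT_HWCAP */
    #include <asm/hwcap.h>      /* HWCAP_ASIMD, HWCAP_AES on aarch64 Linux */

    /* Return non-zero when the ASIMD and AES Crypto Extensions may be used. */
    static int have_aes_ce(void)
    {
            unsigned long hwcap = getauxval(AT_HWCAP);
            return (hwcap & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES);
    }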
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_multibinary_aarch64.S
new file mode 100644
index 000000000..aa7c32576
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/keyexp_multibinary_aarch64.S
@@ -0,0 +1,35 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aarch64_multibinary.h"
+
+mbin_interface aes_keyexp_128
+mbin_interface aes_keyexp_128_enc
+mbin_interface aes_keyexp_192
+mbin_interface aes_keyexp_256
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aarch64_dispatcher.c
new file mode 100644
index 000000000..6c918858e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aarch64_dispatcher.c
@@ -0,0 +1,102 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+#undef PROVIDER_BASIC
+#define PROVIDER_BASIC(a) (void*)0
+
+static unsigned long is_crypto_available(void)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ return (auxval & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_enc)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_128_enc_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_128_enc);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_dec)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_128_dec_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_128_dec);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_enc_expanded_key)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_128_enc_expanded_key_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_128_enc_expanded_key);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_128_dec_expanded_key)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_128_dec_expanded_key_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_128_dec_expanded_key);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_enc)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_256_enc_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_256_enc);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_dec)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_256_dec_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_256_dec);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_enc_expanded_key)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_256_enc_expanded_key_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_256_enc_expanded_key);
+}
+
+DEFINE_INTERFACE_DISPATCHER(XTS_AES_256_dec_expanded_key)
+{
+ if (is_crypto_available()) {
+ return PROVIDER_INFO(XTS_AES_256_dec_expanded_key_ce);
+ }
+ return PROVIDER_BASIC(XTS_AES_256_dec_expanded_key);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_common.S
new file mode 100644
index 000000000..318c1e8a4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_common.S
@@ -0,0 +1,214 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+.altmacro
+.macro aes_key_expand_next next:req,prev:req,ctx:req
+ .if \next == 9
+ mov \ctx, 0x1b
+ .endif
+ dup vdest.4s,vKey\prev\().s[3]
+ ext vtmp.16b,vzero.16b,vKey\prev\().16b,#12
+ aese vdest.16b,vzero.16b
+ eor vKey\next\().16b,vKey\prev\().16b,vtmp.16b
+ ext vtmp.16b,vzero.16b,vtmp.16b,#12
+ eor vKey\next\().16b,vKey\next\().16b,vtmp.16b
+ ext vtmp.16b,vzero.16b,vtmp.16b,#12
+ mov tmpw,vdest.s[0]
+ eor tmpw,\ctx,tmpw,ror 8
+ dup vdest.4s,tmpw
+ eor vKey\next\().16b,vKey\next\().16b,vtmp.16b
+ mov \ctx,ctx,lsl 1
+ eor vKey\next\().16b,vKey\next\().16b,vdest.16b
+.endm
+
+/* when loading key = 0
+ * arg1 = input key
+ * arg2 = rcon ctx register (optional)
+ * when loading key > 0
+ * arg1 = rcon ctx register (optional)
+ */
+.macro aes_key_expand key:req,arg1,arg2
+ .if \key == 0
+ ld1 {vKey\key\().4s},[\arg1]
+ movi vzero.4s, 0
+ .ifb \arg2
+ mov rcon,#0x01
+ .endif
+
+ .ifnb \arg2
+ mov \arg2,#0x01
+ .endif
+ .endif
+
+ .if \key > 0
+ prev=\key-1
+ .ifb \arg1
+ aes_key_expand_next \key,%prev,rcon
+ .endif
+
+ .ifnb \arg1
+ aes_key_expand_next \key,%prev,\arg1
+ .endif
+ .endif
+.endm
+
+.macro aes_round block:req,key:req,mode:req
+ .if \key < 9
+ .if mode == 0
+ aese \block\().16b,vKey\key\().16b
+ aesmc \block\().16b,\block\().16b
+ .else
+ aesd \block\().16b,vKey\key\().16b
+ aesimc \block\().16b,\block\().16b
+ .endif
+ .endif
+ .if \key == 9
+ .if mode == 0
+ aese \block\().16b,vKey\key\().16b
+ .else
+ aesd \block\().16b,vKey\key\().16b
+ .endif
+ .endif
+ .if \key == 10
+ eor \block\().16b,\block\().16b,vKey\key\().16b
+ .endif
+.endm
+
+.macro aes_round_interleave b0:req,b1:req,b2:req,b3:req,key:req,mode:req,last_key
+ .if \key < 9
+ .if \mode == 0
+ aese \b0\().16b,vKey\key\().16b
+ aesmc \b0\().16b,\b0\().16b
+ aese \b1\().16b,vKey\key\().16b
+ aesmc \b1\().16b,\b1\().16b
+ aese \b2\().16b,vKey\key\().16b
+ aesmc \b2\().16b,\b2\().16b
+ aese \b3\().16b,vKey\key\().16b
+ aesmc \b3\().16b,\b3\().16b
+ .else
+ aesd \b0\().16b,vKey\key\().16b
+ aesimc \b0\().16b,\b0\().16b
+ aesd \b1\().16b,vKey\key\().16b
+ aesimc \b1\().16b,\b1\().16b
+ aesd \b2\().16b,vKey\key\().16b
+ aesimc \b2\().16b,\b2\().16b
+ aesd \b3\().16b,vKey\key\().16b
+ aesimc \b3\().16b,\b3\().16b
+ .endif
+ .endif
+
+ .if \key == 9
+ .if \mode == 0
+ aese \b0\().16b,vKey\key\().16b
+ eor \b0\().16b,\b0\().16b,vKey\last_key\().16b
+ aese \b1\().16b,vKey\key\().16b
+ eor \b1\().16b,\b1\().16b,vKey\last_key\().16b
+ aese \b2\().16b,vKey\key\().16b
+ eor \b2\().16b,\b2\().16b,vKey\last_key\().16b
+ aese \b3\().16b,vKey\key\().16b
+ eor \b3\().16b,\b3\().16b,vKey\last_key\().16b
+ .else
+ aesd \b0\().16b,vKey\key\().16b
+ eor \b0\().16b,\b0\().16b,vKey\last_key\().16b
+ aesd \b1\().16b,vKey\key\().16b
+ eor \b1\().16b,\b1\().16b,vKey\last_key\().16b
+ aesd \b2\().16b,vKey\key\().16b
+ eor \b2\().16b,\b2\().16b,vKey\last_key\().16b
+ aesd \b3\().16b,vKey\key\().16b
+ eor \b3\().16b,\b3\().16b,vKey\last_key\().16b
+ .endif
+ .endif
+.endm
+
+.macro aes_rounds_interleave b0:req,b1:req,b2:req,b3:req,mode
+ aes_round_interleave \b0,\b1,\b2,\b3,0,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,1,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,2,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,3,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,4,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,5,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,6,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,7,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,8,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,9,\mode,10
+.endm
+
+.macro aes_rounds blk:req,mode:req
+ aes_round \blk,0,\mode
+ aes_round \blk,1,\mode
+ aes_round \blk,2,\mode
+ aes_round \blk,3,\mode
+ aes_round \blk,4,\mode
+ aes_round \blk,5,\mode
+ aes_round \blk,6,\mode
+ aes_round \blk,7,\mode
+ aes_round \blk,8,\mode
+ aes_round \blk,9,\mode
+ aes_round \blk,10,\mode
+.endm
+
+/* load k1/k2 from memory and encrypt the tweak by k2
+ * both keys will share the same set of registers
+ * but will never overlap (k2 is used only once and discarded)
+ */
+.macro keyload_and_encrypt_tweak iv:req,k2:req,k1:req
+ ldp qKey0,qKey1,[\k2],#32
+ aes_enc_round \iv,0
+ ldp qKey2,qKey3,[\k2],#32
+ aes_enc_round \iv,1
+ ldp qKey0,qKey1,[\k1],#32
+ aes_enc_round \iv,2
+ ldp qKey4,qKey5,[\k2],#32
+ aes_enc_round \iv,3
+ ldp qKey2,qKey3,[\k1],#32
+ aes_enc_round \iv,4
+ ldp qKey6,qKey7,[\k2],#32
+ aes_enc_round \iv,5
+ ldp qKey4,qKey5,[\k1],#32
+ aes_enc_round \iv,6
+ ldp qKey8,qKey9,[k2],#32
+ aes_enc_round \iv,7
+ ldp qKey6,qKey7,[\k1],#32
+ aes_enc_round \iv,8
+ ld1 {vKey10.16b},[\k2],#16
+ aes_enc_round \iv,9
+ ldp qKey8,qKey9,[\k1],#32
+ aes_enc_round \iv,10
+ ld1 {vKey10.16b},[\k1],#16
+.endm
+
+.macro save_stack
+ stp d8,d9,[sp, -32]!
+ add tmpbuf,sp,16
+.endm
+
+.macro restore_stack
+ ldp d8,d9,[sp],32
+.endm
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_dec.S
new file mode 100644
index 000000000..ceae2d3c0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_dec.S
@@ -0,0 +1,116 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_128_common.S"
+#include "xts_aes_common.S"
+
+.macro vswap vec1:req,vec2:req
+ mov vtmp.16b,\vec1\().16b
+ mov \vec1\().16b,\vec2\().16b
+ mov \vec2\().16b,vtmp.16b
+.endm
+
+/* encrypt the tweak with the tweak key (k2) and, at the same time,
+ * expand the encryption key (k1);
+ * even though the two key sets share the same registers,
+ * they never overlap at any given time (k2 is used only once and discarded)
+ */
+.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req
+ aes_key_expand 0,\k2
+ aes_enc_round \iv,0
+ aes_key_expand 1
+ aes_enc_round \iv,1
+ aes_key_expand 0,\k1,rcon2
+ aes_key_expand 2
+ aes_enc_round \iv,2
+ aes_key_expand 1,rcon2
+ aes_key_expand 3
+ aes_enc_round \iv,3
+ aes_key_expand 2,rcon2
+ aes_key_expand 4
+ aes_enc_round \iv,4
+ aes_key_expand 3,rcon2
+ aes_key_expand 5
+ aes_enc_round \iv,5
+ aes_key_expand 4,rcon2
+ aes_key_expand 6
+ aes_enc_round \iv,6
+ aes_key_expand 5,rcon2
+ aes_key_expand 7
+ aes_enc_round \iv,7
+ aes_key_expand 6,rcon2
+ aes_key_expand 8
+ aes_enc_round \iv,8
+ aes_key_expand 7,rcon2
+ aes_key_expand 9
+ aes_enc_round \iv,9
+ aes_key_expand 8,rcon2
+ aes_key_expand 10
+ aes_enc_round \iv,10
+ aes_key_expand 9,rcon2
+ aes_key_expand 10,rcon2
+
+	// transform encryption key into decryption key
+ aesimc vKey1.16b,vKey1.16b
+ vswap vKey0,vKey10
+ aesimc vKey9.16b,vKey9.16b
+
+ aesimc vKey2.16b,vKey2.16b
+ aesimc vKey8.16b,vKey8.16b
+ vswap vKey1,vKey9
+
+ aesimc vKey3.16b,vKey3.16b
+ aesimc vKey7.16b,vKey7.16b
+ vswap vKey2,vKey8
+
+ aesimc vKey4.16b,vKey4.16b
+ aesimc vKey6.16b,vKey6.16b
+ vswap vKey3,vKey7
+
+ aesimc vKey5.16b,vKey5.16b
+ vswap vKey4,vKey6
+.endm
+
+/*
+ * void XTS_AES_128_dec_ce(
+ * uint8_t *k2, //!< key used for tweaking, 16 bytes
+ * uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 16 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *ct, //!< ciphertext sector input data
+ * uint8_t *pt //!< plaintext sector output data
+ * );
+*/
+ .global XTS_AES_128_dec_ce
+ .type XTS_AES_128_dec_ce, %function
+XTS_AES_128_dec_ce:
+ xts_aes_crypt 1,keyexp_and_encrypt_tweak vIV0,key2,key1
+ .size XTS_AES_128_dec_ce, .-XTS_AES_128_dec_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_enc.S
new file mode 100644
index 000000000..23ed14a38
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_128_enc.S
@@ -0,0 +1,91 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_128_common.S"
+#include "xts_aes_common.S"
+
+/* encrypt the tweak with the tweak key (k2) and, at the same time,
+ * expand the encryption key (k1);
+ * even though the two key sets share the same registers,
+ * they never overlap at any given time (k2 is used once and discarded)
+ */
+.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req
+ aes_key_expand 0,\k2
+ aes_enc_round \iv,0
+ aes_key_expand 1
+ aes_enc_round \iv,1
+ aes_key_expand 0,\k1,rcon2
+ aes_key_expand 2
+ aes_enc_round \iv,2
+ aes_key_expand 1,rcon2
+ aes_key_expand 3
+ aes_enc_round \iv,3
+ aes_key_expand 2,rcon2
+ aes_key_expand 4
+ aes_enc_round \iv,4
+ aes_key_expand 3,rcon2
+ aes_key_expand 5
+ aes_enc_round \iv,5
+ aes_key_expand 4,rcon2
+ aes_key_expand 6
+ aes_enc_round \iv,6
+ aes_key_expand 5,rcon2
+ aes_key_expand 7
+ aes_enc_round \iv,7
+ aes_key_expand 6,rcon2
+ aes_key_expand 8
+ aes_enc_round \iv,8
+ aes_key_expand 7,rcon2
+ aes_key_expand 9
+ aes_enc_round \iv,9
+ aes_key_expand 8,rcon2
+ aes_key_expand 10
+ aes_enc_round \iv,10
+ aes_key_expand 9,rcon2
+ aes_key_expand 10,rcon2
+.endm
+
+
+/*
+ * void XTS_AES_128_enc_ce(
+ * uint8_t *k2, //!< key used for tweaking, 16 bytes
+ *	uint8_t *k1,	//!< key used for encryption of tweaked plaintext, 16 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *pt, //!< cleartext sector input data
+ * uint8_t *ct //!< ciphertext sector output data
+ * );
+ */
+ .global XTS_AES_128_enc_ce
+ .type XTS_AES_128_enc_ce, %function
+XTS_AES_128_enc_ce:
+ xts_aes_crypt 0,keyexp_and_encrypt_tweak vIV0,key2,key1
+ .size XTS_AES_128_enc_ce, .-XTS_AES_128_enc_ce
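Unlike the expanded-key variants elsewhere in this series, XTS_AES_128_enc_ce takes the raw 16-byte k1/k2 and expands them on the fly while the initial tweak is being encrypted, which is why keyexp_and_encrypt_tweak interleaves key-schedule and aese steps. A hedged usage sketch built only from the prototype in the comment above (callers normally go through the dispatched XTS_AES_128_enc name rather than the _ce symbol; the wrapper and buffer names below are illustrative):

    #include <stdint.h>

    /* Prototype as documented in the comment above. */
    void XTS_AES_128_enc_ce(uint8_t *k2, uint8_t *k1, uint8_t *TW_initial,
                            uint64_t N, const uint8_t *pt, uint8_t *ct);

    /* Encrypt one whole sector of sector_bytes with AES-128-XTS. */
    static void encrypt_sector(uint8_t k2[16], uint8_t k1[16],
                               uint8_t tweak[16],      /* e.g. sector number */
                               const uint8_t *sector, uint8_t *out,
                               uint64_t sector_bytes)
    {
            XTS_AES_128_enc_ce(k2, k1, tweak, sector_bytes, sector, out);
    }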
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_common.S
new file mode 100644
index 000000000..e6535dba3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_common.S
@@ -0,0 +1,247 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+.altmacro
+.macro aes_key_expand_next out0:req,out1:req,in0:req,in1:req,ctx:req
+ dup vdest.4s,vKey\in1\().s[3]
+ ext vtmp.16b,vzero.16b,vKey\in0\().16b,#12
+ aese vdest.16b,vzero.16b
+ eor vKey\out0\().16b,vKey\in0\().16b,vtmp.16b
+ ext vtmp.16b,vzero.16b,vtmp.16b,#12
+ eor vKey\out0\().16b,vKey\out0\().16b,vtmp.16b
+ ext vtmp.16b,vzero.16b,vtmp.16b,#12
+ mov tmpw,vdest.s[0]
+ eor tmpw,\ctx,tmpw,ror 8
+ dup vdest.4s,tmpw
+ eor vKey\out0\().16b,vKey\out0\().16b,vtmp.16b
+ mov \ctx,ctx,lsl 1
+ eor vKey\out0\().16b,vKey\out0\().16b,vdest.16b
+
+ .if \out1 < 14
+ dup vdest.4s, vKey\out0\().s[3]
+ ext vtmp.16b, vzero.16b,vKey\in1\().16b,#12
+ aese vdest.16b,vzero.16b
+ eor vKey\out1\().16b,vKey\in1\().16b,vtmp.16b
+ ext vtmp.16b,vzero.16b,vtmp.16b,#12
+ eor vKey\out1\().16b,vKey\out1\().16b,vtmp.16b
+ ext vtmp.16b,vzero.16b,vtmp.16b,#12
+ eor vKey\out1\().16b,vKey\out1\().16b,vtmp.16b
+ eor vKey\out1\().16b,vKey\out1\().16b,vdest.16b
+ .endif
+.endm
+
+/* when loading key = 0
+ * arg1 = input key
+ * arg2 = rcon ctx register (optional)
+ * when loading key > 0
+ * arg1 = rcon ctx register (optional)
+ */
+.macro aes_key_expand key:req,arg1,arg2
+ .if \key == 0
+ ld1 {vKey0.4s,vKey1.4s},[\arg1]
+ movi vzero.4s, 0
+ .ifb \arg2
+ mov rcon,#0x01
+ .endif
+
+ .ifnb \arg2
+ mov \arg2,#0x01
+ .endif
+ .endif
+
+ .if \key > 0
+ in0=\key-2
+ in1=\key-1
+ out0=\key
+ out1=\key+1
+ .ifb \arg1
+ aes_key_expand_next %out0,%out1,%in0,%in1,rcon
+ .endif
+
+ .ifnb \arg1
+ aes_key_expand_next %out0,%out1,%in0,%in1,\arg1
+ .endif
+ .endif
+.endm
+
+.macro aes_round block:req,key:req,mode:req
+ .if \key < 13
+ .if mode == 0
+ aese \block\().16b,vKey\key\().16b
+ aesmc \block\().16b,\block\().16b
+ .else
+ aesd \block\().16b,vKey\key\().16b
+ aesimc \block\().16b,\block\().16b
+ .endif
+ .endif
+ .if \key == 13
+ .if mode == 0
+ aese \block\().16b,vKey\key\().16b
+ .else
+ aesd \block\().16b,vKey\key\().16b
+ .endif
+ .endif
+ .if \key == 14
+ eor \block\().16b,\block\().16b,vKey\key\().16b
+ .endif
+.endm
+
+.macro aes_round_interleave b0:req,b1:req,b2:req,b3:req,key:req,mode:req,last_key
+ .if \key < 13
+ .if \mode == 0
+ aese \b0\().16b,vKey\key\().16b
+ aesmc \b0\().16b,\b0\().16b
+ aese \b1\().16b,vKey\key\().16b
+ aesmc \b1\().16b,\b1\().16b
+ aese \b2\().16b,vKey\key\().16b
+ aesmc \b2\().16b,\b2\().16b
+ aese \b3\().16b,vKey\key\().16b
+ aesmc \b3\().16b,\b3\().16b
+ .else
+ aesd \b0\().16b,vKey\key\().16b
+ aesimc \b0\().16b,\b0\().16b
+ aesd \b1\().16b,vKey\key\().16b
+ aesimc \b1\().16b,\b1\().16b
+ aesd \b2\().16b,vKey\key\().16b
+ aesimc \b2\().16b,\b2\().16b
+ aesd \b3\().16b,vKey\key\().16b
+ aesimc \b3\().16b,\b3\().16b
+ .endif
+ .endif
+
+ .if \key == 13
+ .if \mode == 0
+ aese \b0\().16b,vKey\key\().16b
+ eor \b0\().16b,\b0\().16b,vKey\last_key\().16b
+ aese \b1\().16b,vKey\key\().16b
+ eor \b1\().16b,\b1\().16b,vKey\last_key\().16b
+ aese \b2\().16b,vKey\key\().16b
+ eor \b2\().16b,\b2\().16b,vKey\last_key\().16b
+ aese \b3\().16b,vKey\key\().16b
+ eor \b3\().16b,\b3\().16b,vKey\last_key\().16b
+ .else
+ aesd \b0\().16b,vKey\key\().16b
+ eor \b0\().16b,\b0\().16b,vKey\last_key\().16b
+ aesd \b1\().16b,vKey\key\().16b
+ eor \b1\().16b,\b1\().16b,vKey\last_key\().16b
+ aesd \b2\().16b,vKey\key\().16b
+ eor \b2\().16b,\b2\().16b,vKey\last_key\().16b
+ aesd \b3\().16b,vKey\key\().16b
+ eor \b3\().16b,\b3\().16b,vKey\last_key\().16b
+ .endif
+ .endif
+.endm
+
+
+
+.macro aes_rounds_interleave b0:req,b1:req,b2:req,b3:req,mode
+ aes_round_interleave \b0,\b1,\b2,\b3,0,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,1,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,2,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,3,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,4,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,5,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,6,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,7,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,8,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,9,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,10,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,11,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,12,\mode
+ aes_round_interleave \b0,\b1,\b2,\b3,13,\mode,14
+.endm
+
+
+.macro aes_rounds blk:req,mode:req
+ aes_round \blk,0,\mode
+ aes_round \blk,1,\mode
+ aes_round \blk,2,\mode
+ aes_round \blk,3,\mode
+ aes_round \blk,4,\mode
+ aes_round \blk,5,\mode
+ aes_round \blk,6,\mode
+ aes_round \blk,7,\mode
+ aes_round \blk,8,\mode
+ aes_round \blk,9,\mode
+ aes_round \blk,10,\mode
+ aes_round \blk,11,\mode
+ aes_round \blk,12,\mode
+ aes_round \blk,13,\mode
+ aes_round \blk,14,\mode
+.endm
+
+/* load k1/k2 from memory and encrypt the tweak by k2
+ * both keys share the same set of registers
+ * but will never overlap (k2 is used only once and discarded)
+ */
+.macro keyload_and_encrypt_tweak iv:req,k2:req,k1:req
+ ldp qKey0,qKey1,[\k2],#32
+ aes_enc_round \iv,0
+ ldp qKey2,qKey3,[\k2],#32
+ aes_enc_round \iv,1
+ ldp qKey0,qKey1,[\k1],#32
+ aes_enc_round \iv,2
+ ldp qKey4,qKey5,[\k2],#32
+ aes_enc_round \iv,3
+ ldp qKey2,qKey3,[\k1],#32
+ aes_enc_round \iv,4
+ ldp qKey6,qKey7,[\k2],#32
+ aes_enc_round \iv,5
+ ldp qKey4,qKey5,[\k1],#32
+ aes_enc_round \iv,6
+	ldp	qKey8,qKey9,[\k2],#32
+ aes_enc_round \iv,7
+ ldp qKey6,qKey7,[\k1],#32
+ aes_enc_round \iv,8
+	ldp	qKey10,qKey11,[\k2],#32
+ aes_enc_round \iv,9
+ ldp qKey8,qKey9,[\k1],#32
+ aes_enc_round \iv,10
+	ldp	qKey12,qKey13,[\k2],#32
+ aes_enc_round \iv,11
+ ldp qKey10,qKey11,[\k1],#32
+ aes_enc_round \iv,12
+	ld1	{vKey14.16b},[\k2],#16
+ aes_enc_round \iv,13
+ ldp qKey12,qKey13,[\k1],#32
+ aes_enc_round \iv,14
+ ld1 {vKey14.16b},[\k1],#16
+.endm
+
+.macro save_stack
+ stp d8,d9,[sp, -48]!
+ stp d10,d11,[sp, 16]
+ add tmpbuf,sp,32
+.endm
+
+.macro restore_stack
+ ldp d10,d11,[sp, 16]
+ ldp d8,d9,[sp], 48
+.endm
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_dec.S
new file mode 100644
index 000000000..aa46ded08
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_dec.S
@@ -0,0 +1,116 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_256_common.S"
+#include "xts_aes_common.S"
+
+.macro vswap vec1:req,vec2:req
+ mov vtmp.16b,\vec1\().16b
+ mov \vec1\().16b,\vec2\().16b
+ mov \vec2\().16b,vtmp.16b
+.endm
+
+/* encrypt the tweak by tweak key (k2), and at the same time
+ * expand the encryption key (k1)
+ * even though two sets of keys share the same set of registers
+ * they never overlap at any given time (k2 is used only once and discarded)
+ */
+.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req
+ aes_key_expand 0,\k2
+ aes_enc_round \iv,0
+ aes_enc_round \iv,1
+ aes_key_expand 2
+ aes_key_expand 0,\k1,rcon2
+ aes_enc_round \iv,2
+ aes_enc_round \iv,3
+ aes_key_expand 4
+ aes_key_expand 2,rcon2
+ aes_enc_round \iv,4
+ aes_enc_round \iv,5
+ aes_key_expand 6
+ aes_key_expand 4,rcon2
+ aes_enc_round \iv,6
+ aes_enc_round \iv,7
+ aes_key_expand 8
+ aes_key_expand 6,rcon2
+ aes_enc_round \iv,8
+ aes_enc_round \iv,9
+ aes_key_expand 10
+ aes_key_expand 8,rcon2
+ aes_enc_round \iv,10
+ aes_enc_round \iv,11
+ aes_key_expand 12
+ aes_key_expand 10,rcon2
+ aes_enc_round \iv,12
+ aes_enc_round \iv,13
+ aes_key_expand 14
+ aes_key_expand 12,rcon2
+ aes_enc_round \iv,14
+ aes_key_expand 14,rcon2
+
+	// transform encryption key into decryption key
+ aesimc vKey1.16b,vKey1.16b
+ vswap vKey0,vKey14
+ aesimc vKey13.16b,vKey13.16b
+ aesimc vKey2.16b,vKey2.16b
+ vswap vKey1,vKey13
+ aesimc vKey12.16b,vKey12.16b
+ aesimc vKey3.16b,vKey3.16b
+ vswap vKey2,vKey12
+ aesimc vKey11.16b,vKey11.16b
+ aesimc vKey4.16b,vKey4.16b
+ vswap vKey3,vKey11
+ aesimc vKey10.16b,vKey10.16b
+ aesimc vKey5.16b,vKey5.16b
+ vswap vKey4,vKey10
+ aesimc vKey9.16b,vKey9.16b
+ aesimc vKey6.16b,vKey6.16b
+ vswap vKey5,vKey9
+ aesimc vKey8.16b,vKey8.16b
+ aesimc vKey7.16b,vKey7.16b
+ vswap vKey6,vKey8
+.endm
+
+/*
+ * void XTS_AES_256_dec_ce(
+ * uint8_t *k2, //!< key used for tweaking, 32 bytes
+ * uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 32 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *ct, //!< ciphertext sector input data
+ * uint8_t *pt //!< plaintext sector output data
+ * );
+*/
+ .global XTS_AES_256_dec_ce
+ .type XTS_AES_256_dec_ce, %function
+XTS_AES_256_dec_ce:
+ xts_aes_crypt 1,keyexp_and_encrypt_tweak vIV0,key2,key1
+ .size XTS_AES_256_dec_ce, .-XTS_AES_256_dec_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_enc.S
new file mode 100644
index 000000000..8e4088a4d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_256_enc.S
@@ -0,0 +1,88 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_256_common.S"
+#include "xts_aes_common.S"
+
+/* encrypt the tweak by tweak key (k2), and at the same time
+ * expand the encryption key (k1)
+ * even though two sets of keys share the same set of registers
+ * they never overlap at any given time (k2 is used once and discarded)
+ */
+.macro keyexp_and_encrypt_tweak iv:req,k2:req,k1:req
+ aes_key_expand 0,\k2
+ aes_enc_round \iv,0
+ aes_enc_round \iv,1
+ aes_key_expand 2
+ aes_key_expand 0,\k1,rcon2
+ aes_enc_round \iv,2
+ aes_enc_round \iv,3
+ aes_key_expand 4
+ aes_key_expand 2,rcon2
+ aes_enc_round \iv,4
+ aes_enc_round \iv,5
+ aes_key_expand 6
+ aes_key_expand 4,rcon2
+ aes_enc_round \iv,6
+ aes_enc_round \iv,7
+ aes_key_expand 8
+ aes_key_expand 6,rcon2
+ aes_enc_round \iv,8
+ aes_enc_round \iv,9
+ aes_key_expand 10
+ aes_key_expand 8,rcon2
+ aes_enc_round \iv,10
+ aes_enc_round \iv,11
+ aes_key_expand 12
+ aes_key_expand 10,rcon2
+ aes_enc_round \iv,12
+ aes_enc_round \iv,13
+ aes_key_expand 14
+ aes_key_expand 12,rcon2
+ aes_enc_round \iv,14
+ aes_key_expand 14,rcon2
+.endm
+
+/*
+ * void XTS_AES_256_enc_ce(
+ *	uint8_t *k2,	//!< key used for tweaking, 32 bytes
+ *	uint8_t *k1,	//!< key used for encryption of tweaked plaintext, 32 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *pt, //!< cleartext sector input data
+ * uint8_t *ct //!< ciphertext sector output data
+ * );
+ */
+ .global XTS_AES_256_enc_ce
+ .type XTS_AES_256_enc_ce, %function
+XTS_AES_256_enc_ce:
+ xts_aes_crypt 0,keyexp_and_encrypt_tweak vIV0,key2,key1
+ .size XTS_AES_256_enc_ce, .-XTS_AES_256_enc_ce
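For orientation, a caller of the routine above could look like the following sketch; the prototype is copied from the comment, while encrypt_sector and the 512-byte sector size are illustrative assumptions (portable code would normally call the CPU-dispatched XTS_AES_256_enc name rather than this _ce variant directly).

#include <stdint.h>

/* Prototype as documented in the comment above (ce = ARMv8 Crypto Extension build). */
void XTS_AES_256_enc_ce(uint8_t *k2, uint8_t *k1, uint8_t *TW_initial,
                        uint64_t N, const uint8_t *pt, uint8_t *ct);

/* Hypothetical helper: encrypt one 512-byte sector with AES-256-XTS. */
static void encrypt_sector(uint8_t k2[32], uint8_t k1[32], uint8_t tweak[16],
                           const uint8_t pt[512], uint8_t ct[512])
{
    XTS_AES_256_enc_ce(k2, k1, tweak, 512, pt, ct);
}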
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_common.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_common.S
new file mode 100644
index 000000000..c32a13820
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_aes_common.S
@@ -0,0 +1,232 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+.macro declare_var_vector_reg name:req,reg:req
+.ifdef q\name
+ .unreq q\name
+ .unreq v\name
+ .unreq s\name
+ .unreq d\name
+.endif
+ .set q\name , \reg
+ q\name .req q\reg
+ v\name .req v\reg
+ s\name .req s\reg
+ d\name .req d\reg
+.endm
+
+.macro declare_var_generic_reg name:req,reg:req
+ \name .req x\reg
+ x\name .req x\reg
+ w\name .req w\reg
+.endm
+
+ declare_var_vector_reg zero ,0
+ declare_var_vector_reg tmp,1
+ declare_var_vector_reg mask,2
+ declare_var_vector_reg dest,3
+ declare_var_vector_reg blk0,4
+ declare_var_vector_reg blk1,5
+ declare_var_vector_reg blk2,6
+ declare_var_vector_reg blk3,7
+ declare_var_vector_reg Key11,8
+ declare_var_vector_reg Key12,9
+ declare_var_vector_reg Key13,10
+ declare_var_vector_reg Key14,11
+ declare_var_vector_reg SavedIv,16
+ declare_var_vector_reg IV0,17
+ declare_var_vector_reg IV1,18
+ declare_var_vector_reg IV2,19
+ declare_var_vector_reg IV3,20
+ declare_var_vector_reg Key0,21
+ declare_var_vector_reg Key1,22
+ declare_var_vector_reg Key2,23
+ declare_var_vector_reg Key3,24
+ declare_var_vector_reg Key4,25
+ declare_var_vector_reg Key5,26
+ declare_var_vector_reg Key6,27
+ declare_var_vector_reg Key7,28
+ declare_var_vector_reg Key8,29
+ declare_var_vector_reg Key9,30
+ declare_var_vector_reg Key10,31
+
+.macro aes_enc_round block:req,key:req
+ aes_round \block,\key,0
+.endm
+
+.macro aes_dec_round block:req,key:req
+ aes_round \block,\key,1
+.endm
+
+.macro update_iv current:req,next:req
+ mov ivh,\current\().d[1]
+ mov ivl,\current\().d[0]
+ mov tmpw,#0x87
+ extr tmpx2,ivh,ivh,#32
+ extr ivh,ivh,ivl,#63
+ and tmpw,tmpw,tmpw2,asr#31
+ eor ivl,tmpx,ivl,lsl#1
+ mov \next\().d[1],ivh
+ mov \next\().d[0],ivl
+.endm
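The update_iv macro above is the XTS tweak update: a multiplication by x in GF(2^128) with the 0x87 reduction constant, carried across the two 64-bit tweak halves (ivl/ivh). A minimal C sketch of the same computation, assuming tw[0] holds the low 64 bits and tw[1] the high 64 bits (xts_update_tweak is a hypothetical name used only for illustration):

#include <stdint.h>

/* Multiply the 128-bit tweak by x in GF(2^128). The 0x87 constant folds the
 * bit shifted out of the high half back into the low byte, which is what the
 * asr #31 / and sequence above computes as a branch-free mask. */
static void xts_update_tweak(uint64_t tw[2])
{
    uint64_t carry = (tw[1] >> 63) ? 0x87 : 0;  /* was the top bit set? */
    tw[1] = (tw[1] << 1) | (tw[0] >> 63);       /* shift high half, pull carry from low */
    tw[0] = (tw[0] << 1) ^ carry;               /* shift low half, apply reduction */
}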
+
+.macro process_4_blks inp:req,outp:req,mode:req,is_tail
+ update_iv vIV0,vIV1
+ update_iv vIV1,vIV2
+ ldp qblk0,qblk1,[\inp],#32
+ ldp qblk2,qblk3,[\inp],#32
+ .ifnb \is_tail
+ update_iv vIV2, vSavedIv
+ update_iv vSavedIv,vIV3
+ .else
+ update_iv vIV2,vIV3
+ .endif
+ eor vblk0.16b,vblk0.16b,vIV0.16b
+ eor vblk1.16b,vblk1.16b,vIV1.16b
+ eor vblk2.16b,vblk2.16b,vIV2.16b
+ eor vblk3.16b,vblk3.16b,vIV3.16b
+
+ aes_rounds_interleave vblk0,vblk1,vblk2,vblk3,\mode
+ eor vblk0.16b,vblk0.16b,vIV0.16b
+ eor vblk1.16b,vblk1.16b,vIV1.16b
+ stp qblk0,qblk1,[\outp],#32
+ eor vblk2.16b,vblk2.16b,vIV2.16b
+ eor vblk3.16b,vblk3.16b,vIV3.16b
+ stp qblk2,qblk3,[\outp],#32
+ .ifb \is_tail
+ update_iv vIV3,vIV0
+ .endif
+.endm
+
+.macro process_1_blk inp:req,outp:req,mode:req
+ ld1 {vblk0.16b},[\inp],#16
+ eor vblk0.16b,vblk0.16b,vIV0.16b
+ aes_rounds vblk0,\mode
+ eor vblk0.16b,vblk0.16b,vIV0.16b
+ str qblk0,[\outp], #16
+.endm
+
+ key2 .req x0
+ key1 .req x1
+ iv .req x2
+ bytes .req x3
+ inp .req x4
+ outp .req x5
+ rcon .req w6
+ blocks .req x7
+ tmpx .req x8
+ tmpw .req w8
+ tmpw2 .req w9
+ tmpx2 .req x9
+ ivl .req x10
+ ivh .req x11
+ lastblk .req x12
+ tmpbuf .req x13
+ tailcnt .req x14
+ rcon2 .req w15
+
+.macro xts_aes_crypt mode:req,expander,more:vararg
+ save_stack
+
+ ld1 {vIV0.16b},[iv],16
+ .ifnb \expander
+ \expander\() \more
+ .endif
+ lsr blocks,bytes,4
+ and tailcnt,bytes,#0x0F
+
+ cmp bytes,16
+ b.lt .return
+
+.process_4_blks:
+ cmp blocks, 4
+ b.lt .singles
+ subs blocks,blocks,4
+ /* in decryption mode, check whether this is
+	 * the last full block before the less-than-one-block tail;
+	 * the tweak needs to be swapped in that case
+ */
+ .if \mode == 1
+ b.gt .not_tail_4blk
+ cmp tailcnt,1
+ b.lt .not_tail_4blk
+ process_4_blks inp,outp,\mode,1
+ b .process_4_blks
+.not_tail_4blk:
+ .endif
+ process_4_blks inp,outp,\mode
+ b .process_4_blks
+
+.singles:
+ subs blocks,blocks,#1
+ b.lt .checktail
+ /* in decryption mode, check whether this is
+	 * the last full block before the less-than-one-block tail;
+	 * the tweak needs to be swapped in that case
+ */
+ .if \mode == 1
+ b.gt .not_tail_1blk
+ cmp tailcnt,1
+ b.lt .not_tail_1blk
+ mov vSavedIv.16b, vIV0.16b
+ update_iv vSavedIv, vIV0
+ process_1_blk inp,outp,\mode
+ b .checktail
+.not_tail_1blk:
+ .endif
+ process_1_blk inp,outp,\mode
+ update_iv vIV0,vIV0
+ b .singles
+.checktail:
+ cmp tailcnt,1
+ b.lt .return
+ sub lastblk,outp,#16
+.copytail:
+ subs tailcnt,tailcnt,#1
+ ldrb tmpw,[lastblk,tailcnt]
+ strb tmpw,[outp,tailcnt]
+ ldrb tmpw,[inp,tailcnt]
+ strb tmpw,[tmpbuf,tailcnt]
+ b.gt .copytail
+ and tailcnt,bytes,#0x0F
+.steal:
+ cmp tailcnt,15
+ ldrb tmpw,[lastblk,tailcnt]
+ strb tmpw,[tmpbuf,tailcnt]
+ add tailcnt,tailcnt,#1
+ b.lt .steal
+ .if \mode == 1
+ mov vIV0.16b,vSavedIv.16b
+ .endif
+ process_1_blk tmpbuf,lastblk,\mode
+.return:
+ restore_stack
+ ret
+.endm
+
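The .checktail/.copytail/.steal labels in xts_aes_crypt above implement XTS ciphertext stealing for a trailing partial block. A C sketch of the same data movement, under the assumption that out points just past the already-processed last full block and that process_blk stands in for one full-block XTS operation with the final tweak (both names are hypothetical):

#include <stdint.h>
#include <string.h>

static void xts_steal_tail(uint8_t *out, const uint8_t *in_tail, size_t tailcnt,
                           void (*process_blk)(uint8_t dst[16], const uint8_t src[16]))
{
    uint8_t tmpbuf[16];
    uint8_t *lastblk = out - 16;                /* output of the last full block */

    memcpy(out, lastblk, tailcnt);              /* tail output = head of previous block */
    memcpy(tmpbuf, in_tail, tailcnt);           /* partial input block...               */
    memcpy(tmpbuf + tailcnt, lastblk + tailcnt, /* ...padded with the stolen bytes      */
           16 - tailcnt);
    process_blk(lastblk, tmpbuf);               /* rewrite the last full block slot     */
}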
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_dec.S
new file mode 100644
index 000000000..9549ebfa0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_dec.S
@@ -0,0 +1,49 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_128_common.S"
+#include "xts_aes_common.S"
+
+/*
+ * void XTS_AES_128_dec_expanded_key_ce(
+ * uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes - encryption key is used
+ * uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*11 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *ct, //!< ciphertext sector input data
+ * uint8_t *pt //!< plaintext sector output data
+ * );
+*/
+ .global XTS_AES_128_dec_expanded_key_ce
+ .type XTS_AES_128_dec_expanded_key_ce, %function
+XTS_AES_128_dec_expanded_key_ce:
+ xts_aes_crypt 1,keyload_and_encrypt_tweak,vIV0,key2,key1
+ .size XTS_AES_128_dec_expanded_key_ce, .-XTS_AES_128_dec_expanded_key_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_enc.S
new file mode 100644
index 000000000..1f2d2db2e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_128_enc.S
@@ -0,0 +1,49 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_128_common.S"
+#include "xts_aes_common.S"
+
+/*
+ * void XTS_AES_128_enc_expanded_key_ce(
+ * uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes
+ * uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*11 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *pt, //!< plaintext sector input data
+ * uint8_t *ct //!< ciphertext sector output data
+ * );
+ */
+ .global XTS_AES_128_enc_expanded_key_ce
+ .type XTS_AES_128_enc_expanded_key_ce, %function
+XTS_AES_128_enc_expanded_key_ce:
+ xts_aes_crypt 0,keyload_and_encrypt_tweak,vIV0,key2,key1
+ .size XTS_AES_128_enc_expanded_key_ce, .-XTS_AES_128_enc_expanded_key_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_dec.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_dec.S
new file mode 100644
index 000000000..95c8bf63d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_dec.S
@@ -0,0 +1,49 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_256_common.S"
+#include "xts_aes_common.S"
+
+/*
+ * void XTS_AES_256_dec_expanded_key_ce(
+ * uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes - encryption key is used
+ * uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*15 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *ct, //!< ciphertext sector input data
+ * uint8_t *pt //!< plaintext sector output data
+ * );
+*/
+ .global XTS_AES_256_dec_expanded_key_ce
+ .type XTS_AES_256_dec_expanded_key_ce, %function
+XTS_AES_256_dec_expanded_key_ce:
+ xts_aes_crypt 1,keyload_and_encrypt_tweak,vIV0,key2,key1
+ .size XTS_AES_256_dec_expanded_key_ce, .-XTS_AES_256_dec_expanded_key_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_enc.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_enc.S
new file mode 100644
index 000000000..bd840a994
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_keyexp_aes_256_enc.S
@@ -0,0 +1,49 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+
+#include "xts_aes_256_common.S"
+#include "xts_aes_common.S"
+
+/*
+ * void XTS_AES_256_enc_expanded_key_ce(
+ * uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes
+ * uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*15 bytes
+ * uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ * uint64_t N, //!< sector size, in bytes
+ * const uint8_t *pt, //!< plaintext sector input data
+ * uint8_t *ct //!< ciphertext sector output data
+ * );
+ */
+ .global XTS_AES_256_enc_expanded_key_ce
+ .type XTS_AES_256_enc_expanded_key_ce, %function
+XTS_AES_256_enc_expanded_key_ce:
+ xts_aes_crypt 0,keyload_and_encrypt_tweak,vIV0,key2,key1
+ .size XTS_AES_256_enc_expanded_key_ce, .-XTS_AES_256_enc_expanded_key_ce
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_multibinary_aarch64.S
new file mode 100644
index 000000000..af77d885b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aarch64/xts_multibinary_aarch64.S
@@ -0,0 +1,39 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aarch64_multibinary.h"
+
+mbin_interface XTS_AES_128_enc
+mbin_interface XTS_AES_128_dec
+mbin_interface XTS_AES_128_enc_expanded_key
+mbin_interface XTS_AES_128_dec_expanded_key
+mbin_interface XTS_AES_256_enc
+mbin_interface XTS_AES_256_dec
+mbin_interface XTS_AES_256_enc_expanded_key
+mbin_interface XTS_AES_256_dec_expanded_key
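These mbin_interface entries create the public, CPU-dispatched symbols; applications call the plain names and the multibinary layer selects the Crypto Extension implementation at runtime when it is available. A hedged usage sketch (the aes_xts.h include path and the 512-byte sector size are assumptions made for illustration):

#include <stdint.h>
#include <aes_xts.h>    /* ISA-L crypto XTS prototypes; header name assumed */

/* Hypothetical helper: AES-128-XTS encrypt one 512-byte sector via the
 * dispatched entry point rather than a CPU-specific _ce/_sse variant. */
static void encrypt_sector_128(uint8_t k2[16], uint8_t k1[16], uint8_t tweak[16],
                               const uint8_t pt[512], uint8_t ct[512])
{
    XTS_AES_128_enc(k2, k1, tweak, 512, pt, ct);
}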
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm b/src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm
new file mode 100644
index 000000000..22f00b395
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/aes_common.asm
@@ -0,0 +1,377 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _AES_COMMON_ASM_
+%define _AES_COMMON_ASM_
+
+%include "reg_sizes.asm"
+
+;; =============================================================================
+;; Generic macro to produce code that executes %%OPCODE instruction
+;; on a selected number of AES blocks (16 bytes each) between 0 and 16.
+;; All three operands of the instruction come from registers.
+;; Note: if 3 blocks are left at the end, an instruction operating on all
+;; 4 blocks (the full width of a ZMM register) is produced.
+
+%macro ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 14
+%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16)
+%define %%OPCODE %2 ; [in] instruction name
+%define %%DST0 %3 ; [out] destination ZMM register
+%define %%DST1 %4 ; [out] destination ZMM register
+%define %%DST2 %5 ; [out] destination ZMM register
+%define %%DST3 %6 ; [out] destination ZMM register
+%define %%SRC1_0 %7 ; [in] source 1 ZMM register
+%define %%SRC1_1 %8 ; [in] source 1 ZMM register
+%define %%SRC1_2 %9 ; [in] source 1 ZMM register
+%define %%SRC1_3 %10 ; [in] source 1 ZMM register
+%define %%SRC2_0 %11 ; [in] source 2 ZMM register
+%define %%SRC2_1 %12 ; [in] source 2 ZMM register
+%define %%SRC2_2 %13 ; [in] source 2 ZMM register
+%define %%SRC2_3 %14 ; [in] source 2 ZMM register
+
+%assign reg_idx 0
+%assign blocks_left %%NUM_BLOCKS
+
+%rep (%%NUM_BLOCKS / 4)
+%xdefine %%DSTREG %%DST %+ reg_idx
+%xdefine %%SRC1REG %%SRC1_ %+ reg_idx
+%xdefine %%SRC2REG %%SRC2_ %+ reg_idx
+ %%OPCODE %%DSTREG, %%SRC1REG, %%SRC2REG
+%undef %%DSTREG
+%undef %%SRC1REG
+%undef %%SRC2REG
+%assign reg_idx (reg_idx + 1)
+%assign blocks_left (blocks_left - 4)
+%endrep
+
+%xdefine %%DSTREG %%DST %+ reg_idx
+%xdefine %%SRC1REG %%SRC1_ %+ reg_idx
+%xdefine %%SRC2REG %%SRC2_ %+ reg_idx
+
+%if blocks_left == 1
+ %%OPCODE XWORD(%%DSTREG), XWORD(%%SRC1REG), XWORD(%%SRC2REG)
+%elif blocks_left == 2
+ %%OPCODE YWORD(%%DSTREG), YWORD(%%SRC1REG), YWORD(%%SRC2REG)
+%elif blocks_left == 3
+ %%OPCODE %%DSTREG, %%SRC1REG, %%SRC2REG
+%endif
+
+%endmacro
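A small C model of the block-to-register-width mapping this macro expands to can make the %rep/%if logic easier to follow; note the special case where a 3-block remainder is rounded up to a full ZMM operation, as the note above states (zmm_op_shape is an illustrative name only).

#include <stdio.h>

/* For a 0..16 block count, report how many full 4-block ZMM operations are
 * emitted plus the width, in blocks, of the single trailing operation. */
static void zmm_op_shape(int num_blocks, int *full_zmm_ops, int *tail_blocks)
{
    *full_zmm_ops = num_blocks / 4;
    *tail_blocks  = num_blocks % 4;   /* 1 -> XMM, 2 -> YMM */
    if (*tail_blocks == 3)
        *tail_blocks = 4;             /* 3 -> whole ZMM (all 4 lanes) */
}

int main(void)
{
    for (int n = 0; n <= 16; n++) {
        int full, tail;
        zmm_op_shape(n, &full, &tail);
        printf("blocks=%2d -> %d full ZMM op(s), tail of %d block(s)\n", n, full, tail);
    }
    return 0;
}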
+
+;; =============================================================================
+;; Loads specified number of AES blocks into ZMM registers
+;; %%FLAGS are optional and only affect behavior when 3 trailing blocks are left
+;; - if %%FLAGS is not provided then exactly 3 blocks are loaded (move and insert)
+;; - if "load_4_instead_of_3" option is passed then 4 blocks are loaded
+%macro ZMM_LOAD_BLOCKS_0_16 7-8
+%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16)
+%define %%INP %2 ; [in] input data pointer to read from
+%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical)
+%define %%DST0 %4 ; [out] ZMM register with loaded data
+%define %%DST1 %5 ; [out] ZMM register with loaded data
+%define %%DST2 %6 ; [out] ZMM register with loaded data
+%define %%DST3 %7 ; [out] ZMM register with loaded data
+%define %%FLAGS %8 ; [in] optional "load_4_instead_of_3"
+
+%assign src_offset 0
+%assign dst_idx 0
+
+%rep (%%NUM_BLOCKS / 4)
+%xdefine %%DSTREG %%DST %+ dst_idx
+ vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset]
+%undef %%DSTREG
+%assign src_offset (src_offset + 64)
+%assign dst_idx (dst_idx + 1)
+%endrep
+
+%assign blocks_left (%%NUM_BLOCKS % 4)
+%xdefine %%DSTREG %%DST %+ dst_idx
+
+%if blocks_left == 1
+ vmovdqu8 XWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset]
+%elif blocks_left == 2
+ vmovdqu8 YWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset]
+%elif blocks_left == 3
+%ifidn %%FLAGS, load_4_instead_of_3
+ vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset]
+%else
+ vmovdqu8 YWORD(%%DSTREG), [%%INP + %%DATA_OFFSET + src_offset]
+ vinserti64x2 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset + 32], 2
+%endif
+%endif
+
+%endmacro
+
+;; =============================================================================
+;; Loads specified number of AES blocks into ZMM registers using mask register
+;; for the last loaded register (xmm, ymm or zmm).
+;; Loads take place at 1 byte granularity.
+%macro ZMM_LOAD_MASKED_BLOCKS_0_16 8
+%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16)
+%define %%INP %2 ; [in] input data pointer to read from
+%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical)
+%define %%DST0 %4 ; [out] ZMM register with loaded data
+%define %%DST1 %5 ; [out] ZMM register with loaded data
+%define %%DST2 %6 ; [out] ZMM register with loaded data
+%define %%DST3 %7 ; [out] ZMM register with loaded data
+%define %%MASK %8 ; [in] mask register
+
+%assign src_offset 0
+%assign dst_idx 0
+%assign blocks_left %%NUM_BLOCKS
+
+%if %%NUM_BLOCKS > 0
+%rep (((%%NUM_BLOCKS + 3) / 4) - 1)
+%xdefine %%DSTREG %%DST %+ dst_idx
+ vmovdqu8 %%DSTREG, [%%INP + %%DATA_OFFSET + src_offset]
+%undef %%DSTREG
+%assign src_offset (src_offset + 64)
+%assign dst_idx (dst_idx + 1)
+%assign blocks_left (blocks_left - 4)
+%endrep
+%endif ; %if %%NUM_BLOCKS > 0
+
+%xdefine %%DSTREG %%DST %+ dst_idx
+
+%if blocks_left == 1
+ vmovdqu8 XWORD(%%DSTREG){%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset]
+%elif blocks_left == 2
+ vmovdqu8 YWORD(%%DSTREG){%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset]
+%elif (blocks_left == 3 || blocks_left == 4)
+ vmovdqu8 %%DSTREG{%%MASK}{z}, [%%INP + %%DATA_OFFSET + src_offset]
+%endif
+
+%endmacro
+
+;; =============================================================================
+;; Stores specified number of AES blocks from ZMM registers
+%macro ZMM_STORE_BLOCKS_0_16 7
+%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16)
+%define %%OUTP %2 ; [in] output data pointer to write to
+%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical)
+%define %%SRC0 %4 ; [in] ZMM register with data to store
+%define %%SRC1 %5 ; [in] ZMM register with data to store
+%define %%SRC2 %6 ; [in] ZMM register with data to store
+%define %%SRC3 %7 ; [in] ZMM register with data to store
+
+%assign dst_offset 0
+%assign src_idx 0
+
+%rep (%%NUM_BLOCKS / 4)
+%xdefine %%SRCREG %%SRC %+ src_idx
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], %%SRCREG
+%undef %%SRCREG
+%assign dst_offset (dst_offset + 64)
+%assign src_idx (src_idx + 1)
+%endrep
+
+%assign blocks_left (%%NUM_BLOCKS % 4)
+%xdefine %%SRCREG %%SRC %+ src_idx
+
+%if blocks_left == 1
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], XWORD(%%SRCREG)
+%elif blocks_left == 2
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], YWORD(%%SRCREG)
+%elif blocks_left == 3
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], YWORD(%%SRCREG)
+ vextracti32x4 [%%OUTP + %%DATA_OFFSET + dst_offset + 32], %%SRCREG, 2
+%endif
+
+%endmacro
+
+;; =============================================================================
+;; Stores specified number of AES blocks from ZMM registers with mask register
+;; for the last loaded register (xmm, ymm or zmm).
+;; Stores take place at 1 byte granularity.
+%macro ZMM_STORE_MASKED_BLOCKS_0_16 8
+%define %%NUM_BLOCKS %1 ; [in] numerical value, number of AES blocks (0 to 16)
+%define %%OUTP %2 ; [in] output data pointer to write to
+%define %%DATA_OFFSET %3 ; [in] offset to the output pointer (GP or numerical)
+%define %%SRC0 %4 ; [in] ZMM register with data to store
+%define %%SRC1 %5 ; [in] ZMM register with data to store
+%define %%SRC2 %6 ; [in] ZMM register with data to store
+%define %%SRC3 %7 ; [in] ZMM register with data to store
+%define %%MASK %8 ; [in] mask register
+
+%assign dst_offset 0
+%assign src_idx 0
+%assign blocks_left %%NUM_BLOCKS
+
+%if %%NUM_BLOCKS > 0
+%rep (((%%NUM_BLOCKS + 3) / 4) - 1)
+%xdefine %%SRCREG %%SRC %+ src_idx
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset], %%SRCREG
+%undef %%SRCREG
+%assign dst_offset (dst_offset + 64)
+%assign src_idx (src_idx + 1)
+%assign blocks_left (blocks_left - 4)
+%endrep
+%endif ; %if %%NUM_BLOCKS > 0
+
+%xdefine %%SRCREG %%SRC %+ src_idx
+
+%if blocks_left == 1
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, XWORD(%%SRCREG)
+%elif blocks_left == 2
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, YWORD(%%SRCREG)
+%elif (blocks_left == 3 || blocks_left == 4)
+ vmovdqu8 [%%OUTP + %%DATA_OFFSET + dst_offset]{%%MASK}, %%SRCREG
+%endif
+
+%endmacro
+
+;;; ===========================================================================
+;;; Handles AES encryption rounds
+;;; It handles special cases: the last and first rounds
+;;; Optionally, it performs XOR with data after the last AES round.
+;;; Uses the NROUNDS parameter to check what needs to be done for the current round.
+;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks).
+%macro ZMM_AESENC_ROUND_BLOCKS_0_16 12
+%define %%L0B0_3 %1 ; [in/out] zmm; blocks 0 to 3
+%define %%L0B4_7 %2 ; [in/out] zmm; blocks 4 to 7
+%define %%L0B8_11 %3 ; [in/out] zmm; blocks 8 to 11
+%define %%L0B12_15 %4 ; [in/out] zmm; blocks 12 to 15
+%define %%KEY %5 ; [in] zmm containing round key
+%define %%ROUND %6 ; [in] round number
+%define %%D0_3 %7 ; [in] zmm or no_data; plain/cipher text blocks 0-3
+%define %%D4_7 %8 ; [in] zmm or no_data; plain/cipher text blocks 4-7
+%define %%D8_11 %9 ; [in] zmm or no_data; plain/cipher text blocks 8-11
+%define %%D12_15 %10 ; [in] zmm or no_data; plain/cipher text blocks 12-15
+%define %%NUMBL %11 ; [in] number of blocks; numerical value
+%define %%NROUNDS %12 ; [in] number of rounds; numerical value
+
+;;; === first AES round
+%if (%%ROUND < 1)
+ ;; round 0
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+%endif ; ROUND 0
+
+;;; === middle AES rounds
+%if (%%ROUND >= 1 && %%ROUND <= %%NROUNDS)
+ ;; rounds 1 to 9/11/13
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesenc, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+%endif ; rounds 1 to 9/11/13
+
+;;; === last AES round
+%if (%%ROUND > %%NROUNDS)
+ ;; the last round - mix enclast with text xor's
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesenclast, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+
+;;; === XOR with data
+%ifnidn %%D0_3, no_data
+%ifnidn %%D4_7, no_data
+%ifnidn %%D8_11, no_data
+%ifnidn %%D12_15, no_data
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%D0_3, %%D4_7, %%D8_11, %%D12_15
+%endif ; !no_data
+%endif ; !no_data
+%endif ; !no_data
+%endif ; !no_data
+
+%endif ; The last round
+
+%endmacro
+
+;;; ===========================================================================
+;;; Handles AES decryption rounds
+;;; It handles special cases: the last and first rounds
+;;; Optionally, it performs XOR with data after the last AES round.
+;;; Uses NROUNDS parameter to check what needs to be done for the current round.
+;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks).
+%macro ZMM_AESDEC_ROUND_BLOCKS_0_16 12
+%define %%L0B0_3 %1 ; [in/out] zmm; blocks 0 to 3
+%define %%L0B4_7 %2 ; [in/out] zmm; blocks 4 to 7
+%define %%L0B8_11 %3 ; [in/out] zmm; blocks 8 to 11
+%define %%L0B12_15 %4 ; [in/out] zmm; blocks 12 to 15
+%define %%KEY %5 ; [in] zmm containing round key
+%define %%ROUND %6 ; [in] round number
+%define %%D0_3 %7 ; [in] zmm or no_data; cipher text blocks 0-3
+%define %%D4_7 %8 ; [in] zmm or no_data; cipher text blocks 4-7
+%define %%D8_11 %9 ; [in] zmm or no_data; cipher text blocks 8-11
+%define %%D12_15 %10 ; [in] zmm or no_data; cipher text blocks 12-15
+%define %%NUMBL %11 ; [in] number of blocks; numerical value
+%define %%NROUNDS %12 ; [in] number of rounds; numerical value
+
+;;; === first AES round
+%if (%%ROUND < 1)
+ ;; round 0
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+%endif ; ROUND 0
+
+;;; === middle AES rounds
+%if (%%ROUND >= 1 && %%ROUND <= %%NROUNDS)
+ ;; rounds 1 to 9/11/13
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesdec, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+%endif ; rounds 1 to 9/11/13
+
+;;; === last AES round
+%if (%%ROUND > %%NROUNDS)
+	;; the last round - mix declast with text xor's
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vaesdeclast, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%KEY, %%KEY, %%KEY, %%KEY
+
+;;; === XOR with data
+%ifnidn %%D0_3, no_data
+%ifnidn %%D4_7, no_data
+%ifnidn %%D8_11, no_data
+%ifnidn %%D12_15, no_data
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%NUMBL, vpxorq, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%L0B0_3, %%L0B4_7, %%L0B8_11, %%L0B12_15, \
+ %%D0_3, %%D4_7, %%D8_11, %%D12_15
+%endif ; !no_data
+%endif ; !no_data
+%endif ; !no_data
+%endif ; !no_data
+
+%endif ; The last round
+
+%endmacro
+
+%endif ;; _AES_COMMON_ASM_
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm
new file mode 100644
index 000000000..2a879abdd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm
@@ -0,0 +1,431 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; the following defines control the operation of the macros below and
+; need to be defined in the including file
+; KEY_ROUNDS - number of key rounds needed based on key length: 128bit - 11, 192bit - 13 or 256bit - 15
+; EARLY_BLOCKS - number of data blocks to load before starting computations
+; PARALLEL_BLOCKS - number of blocks of data to process in parallel; also the number of xmm regs to reserve for data
+; IV_CNT - number of xmm regs to use for IV data; valid values are 0 or 1
+; TMP_CNT - number of tmp xmm registers to reserve
+; XMM_USAGE - number of xmm registers to use; must be at least PARALLEL_BLOCKS + 2
+;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;
+; the following instruction-set-specific macros must be defined in the user file
+; to make use of the AES macros below
+; MOVDQ - move from memory to xmm reg
+; PXOR - XOR of two xmm registers pxor
+; AES_DEC - AES block decode for early key rounds
+; AES_DEC_LAST - AES block decode for last key round
+; or
+; AES_ENC - AES block encode for early key rounds
+; AES_ENC_LAST - AES block encode for last key round
+
+; Three usages of xmm regs: key round cache, block data and one temp
+; CKEY_CNT is (number of xmm regs) - PARALLEL_BLOCKS - IV holder - 2 TMP xmm regs
+%assign FIRST_XDATA (0)
+%assign IV_IDX (FIRST_XDATA + PARALLEL_BLOCKS)
+%ifndef IV_CNT
+%define IV_CNT (1)
+%endif
+%assign TMP (IV_IDX + IV_CNT)
+%assign TMP_CNT (2)
+%assign FIRST_CKEY (TMP + TMP_CNT)
+%assign CKEY_CNT (XMM_USAGE - (PARALLEL_BLOCKS + IV_CNT + TMP_CNT))
+
+; Abstract xmm register usages that identify the expected contents of the register
+%define reg(i) xmm %+ i
+%define XDATA(i) xmm %+ i
+%define KEY_REG(i) xmm %+ i
+%define IV_REG(i) xmm %+ i
+
+%define IDX rax
+
+
+
+
+;
+;
+; AES CBC ENCODE MACROS
+;
+;
+
+;
+; CBC_DECRYPT_BLOCKS
+; Decrypts a number of blocks using the AES_PARALLEL_ENC_BLOCKS macro,
+; finalizes the decryption and saves the results in the output,
+; places the last buffer's cipher text in IV for the next buffer,
+; and updates the index and the number of bytes left
+;
+%macro CBC_DECRYPT_BLOCKS 17
+%define %%TOT_ROUNDS %1
+%define %%num_blocks %2 ; can be 0..13
+%define %%EARLY_LOADS	%3	; number of data blocks to load before processing
+%define %%MOVDQ %4
+%define %%PXOR %5
+%define %%AES_DEC %6
+%define %%AES_DEC_LAST %7
+%define %%CACHED_KEYS %8 ; number of key data cached in xmm regs
+%define %%TMP %9
+%define %%TMP_CNT %10
+%define %%FIRST_CKEY %11
+%define %%KEY_DATA %12
+%define %%FIRST_XDATA %13
+%define %%IN %14 ; input data
+%define %%OUT %15 ; output data
+%define %%IDX %16 ; index into input and output data buffers
+%define %%LEN %17
+
+ AES_PARALLEL_ENC_BLOCKS %%TOT_ROUNDS, %%num_blocks, %%EARLY_LOADS, %%MOVDQ, %%PXOR, %%AES_DEC, %%AES_DEC_LAST, %%CACHED_KEYS, %%TMP, %%TMP_CNT, %%FIRST_CKEY, %%KEY_DATA, %%FIRST_XDATA, %%IN, %%OUT, %%IDX
+
+ ;
+ ; XOR the result of each block's decrypt with the previous block's cypher text (C)
+ ;
+ %assign i 0
+ %rep (%%num_blocks)
+ %%PXOR XDATA(i), XDATA(IV_IDX) ; XOR result with previous block's C
+ %%MOVDQ [%%OUT + %%IDX + i*16], XDATA(i) ; save plain text to out
+ %%MOVDQ XDATA(IV_IDX), [%%IN + IDX + i*16] ; load IV with current block C
+ %assign i (i+1)
+ %endrep
+
+ add %%IDX, %%num_blocks*16
+ sub %%LEN, %%num_blocks*16
+%endmacro
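In C terms, the chaining step this macro performs after the parallel block decrypt is the standard CBC recombination sketched below; cbc_chain is a hypothetical name, dec[] stands for the raw AES block decryptions produced by AES_PARALLEL_ENC_BLOCKS, and ct[] is the original ciphertext.

#include <stdint.h>
#include <string.h>

/* pt[i] = dec[i] XOR previous ciphertext block; the last ciphertext block is
 * left in iv for the next call, matching what the macro does per iteration. */
static void cbc_chain(uint8_t *pt, const uint8_t *dec, const uint8_t *ct,
                      uint8_t iv[16], size_t nblocks)
{
    for (size_t i = 0; i < nblocks; i++) {
        for (size_t b = 0; b < 16; b++)
            pt[i * 16 + b] = dec[i * 16 + b] ^ iv[b];
        memcpy(iv, ct + i * 16, 16);            /* current C becomes next IV */
    }
}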
+
+
+;
+; CBC_ENC_INIT
+; XOR first data block with the IV data
+%macro CBC_ENC_INIT 7
+%define %%P_FIRST %1
+%define %%IV_IDX %2
+%define %%MOVDQ %3
+%define %%PXOR %4
+%define %%IV %5
+%define %%IN %6 ; input data
+%define %%IDX %7 ; index into input and output data buffers
+
+ %%MOVDQ XDATA(%%P_FIRST), [%%IN + %%IDX + 0*16]
+ %%MOVDQ reg(%%IV_IDX), [%%IV]
+ %%PXOR XDATA(%%P_FIRST), reg(%%IV_IDX)
+%endmacro
+
+;
+; assumptions:
+; LEN is length of data remaining
+; IDX is offset into the data buffer
+;
+; subloops
+; if data > 16 load next block into a next XDATA reg (XDATA(p_next))
+; load first uncached key into TMP0 (if any)
+; AES block encrypt XDATA(P_FIRST)
+; if data > 16 XOR next block (XDATA(p_next)) with current (XDATA(P_FIRST))
+; save current (XDATA(P_FIRST))
+; update indexes for P_FIRST
+; end if data zero
+;
+%macro CBC_ENC_SUBLOOP 17
+%define %%TOT_ROUNDS %1
+%define %%BLOCKS %2 ; can be 1...14
+%define %%START_DATA %3
+%define %%MOVDQ %4
+%define %%PXOR %5
+%define %%AES_DEC %6
+%define %%AES_DEC_LAST %7
+%define %%TMP %8
+%define %%TMP_CNT %9
+%define %%FIRST_CKEY %10
+%define %%CKEY_CNT %11
+%define %%KEYS %12
+%define %%CACHED_KEYS %13
+%define %%IN %14 ; input data
+%define %%OUT %15 ; output data
+%define %%IDX %16 ; index into input and output data buffers
+%define %%LEN %17
+
+ %assign this_blk 0
+ %assign next_blk 1
+ %assign p_first %%START_DATA
+ %assign p_next (p_first+1)
+ ; for number of blocks to be processed in a loop
+ %assign blk 1
+ %rep %%BLOCKS
+ ; if data > 16 load next block into a next XDATA reg (XDATA(p_next))
+ cmp %%LEN, 16
+ %push skip_read
+ je %$skip_read_next
+ %%MOVDQ XDATA(p_next), [%%IN + %%IDX + next_blk*16]
+ %$skip_read_next:
+ %pop
+
+ AES_ENC_BLOCKS %%TOT_ROUNDS, p_first, %%TMP, %%TMP_CNT, %%FIRST_CKEY, %%CKEY_CNT, %%KEYS, %%MOVDQ, %%PXOR, %%AES_DEC, %%AES_DEC_LAST
+
+		; if data > 16 XOR next block (XDATA(p_next)) with current (XDATA(p_first))
+ cmp %%LEN, 16
+ %push skip_next
+ je %$skip_next_blk_start
+ %%PXOR XDATA(p_next), XDATA(p_first)
+ %$skip_next_blk_start:
+ %pop
+
+ ; save current (XDATA(p_first))
+ %%MOVDQ [%%OUT + %%IDX + this_blk*16], XDATA(p_first)
+ ; update indexes for p_first
+ add %%IDX, 16
+ sub %%LEN, 16
+
+ %if (blk < %%BLOCKS) ; only insert jz if NOT last block
+ ; end if data zero
+ jz %%END_CBC_ENC_SUBLOOP
+ %endif ; (p_next < %%BLOCKS)
+
+ %assign p_first (p_next)
+ %assign blk (blk+1)
+ %if (blk == %%BLOCKS) ; the last rep loop's read of the next block needs to be into START_DATA
+ %assign p_next (%%START_DATA)
+ %elif (1 == %%BLOCKS)
+ %%MOVDQ XDATA(%%START_DATA), XDATA(p_next)
+ %else
+ %assign p_next (p_next+1)
+ %endif
+ %endrep ; %%BLOCKS
+
+ %%END_CBC_ENC_SUBLOOP:
+%endm ; CBC_ENC_SUBLOOP
+
+
+;
+;
+; AES BLOCK ENCODE MACROS
+;
+;
+
+;
+; FILL_KEY_CACHE
+; Load key data into the cache key xmm regs
+%macro FILL_KEY_CACHE 4
+%define %%CACHED_KEYS %1
+%define %%CKEY_START %2
+%define %%KEY_DATA %3
+%define %%MOVDQ %4
+
+ %assign rnd 0
+ %rep KEY_ROUNDS
+ %if (rnd < %%CACHED_KEYS) ; find the round's key data
+ %assign c (rnd + %%CKEY_START)
+ %%MOVDQ KEY_REG(c), [%%KEY_DATA + rnd*16] ;load sub key into an available register
+ %endif
+ %assign rnd (rnd+1)
+ %endrep
+%endmacro
+
+;
+; SCHEDULE_DATA_LOAD
+; pre-loads message data into xmm regs
+; updates global 'blocks_loaded' that tracks which data blocks have been loaded
+; 'blocks_loaded' is an in/out global and must be declared in the using macro or function
+%macro SCHEDULE_DATA_LOAD 5
+%define %%PARALLEL_DATA %1
+%define %%EARLY_LOADS %2
+%define %%MOVDQ %3
+%define %%IN %4
+%define %%IDX %5
+
+ %if (blocks_loaded < %%PARALLEL_DATA)
+ ; load cipher text
+ %%MOVDQ XDATA(blocks_loaded), [%%IN + %%IDX + blocks_loaded*16]
+ %assign blocks_loaded (blocks_loaded+1)
+ %endif ; (blocks_loaded < %%PARALLEL_DATA)
+%endmacro ; SCHEDULED_EARLY_DATA_LOADS
+
+;
+; INIT_SELECT_KEY
+; determine which xmm reg holds the key data needed or loads it into the temp register if not cached
+; 'current_tmp' is an in/out global and must be declared in the using macro or function
+%macro INIT_SELECT_KEY 6
+%define %%TOT_ROUNDS %1
+%define %%CACHED_KEYS %2
+%define %%KEY_DATA %3
+%define %%FIRST_TMP %4
+%define %%TMP_CNT %5
+%define %%MOVDQ %6
+
+ %assign current_tmp (%%FIRST_TMP)
+ %if (%%TOT_ROUNDS > %%CACHED_KEYS) ; load the first uncached key into temp reg
+ %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + %%CACHED_KEYS*16]
+ %endif ; (KEY_ROUNDS > CKEY_CNT)
+%endmacro ; SELECT_KEY
+
+;
+; SELECT_KEY
+; determine which xmm reg holds the key data needed or loads it into the temp register if not cached
+; 'current_tmp' is an in/out global and must be declared in the using macro or function
+%macro SELECT_KEY 8
+%define %%ROUND %1
+%define %%TOT_ROUNDS %2
+%define %%CACHED_KEYS %3
+%define %%FIRST_KEY %4
+%define %%KEY_DATA %5
+%define %%FIRST_TMP %6
+%define %%TMP_CNT %7
+%define %%MOVDQ %8
+
+ ; find the key data for this round
+ %if (%%ROUND < %%CACHED_KEYS) ; is it cached
+ %assign key (%%ROUND + %%FIRST_KEY)
+ %else
+ ; Load non-cached key %%ROUND data ping-ponging between temp regs if more than one
+ %assign key (current_tmp) ; use the previous loaded key data
+ %if (1 == %%TMP_CNT)
+ %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + %%ROUND*16] ; load the next rounds key data
+ %else
+ %assign next_round (%%ROUND+1)
+ %if (next_round < %%TOT_ROUNDS) ; if more rounds to be done
+ %if (current_tmp == %%FIRST_TMP) ; calc the next temp reg to use
+ %assign current_tmp (current_tmp + 1)
+ %else
+ %assign current_tmp (%%FIRST_TMP)
+ %endif ; (current_tmp == %%FIRST_TMP)
+ %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + next_round*16] ; load the next rounds key data
+
+ %endif ; (%%ROUND < KEY_ROUNDS)
+ %endif ; (1 < %%TMP_CNT)
+ %endif ; (%%ROUND < %%CACHED_KEYS)
+%endmacro ; SELECT_KEY
+
+
+;
+; AES_PARALLEL_ENC_BLOCKS
+; preloads some data blocks to be worked on
+; starts the aes block encoding while loading the other blocks to be done in parallel
+; aes block encodes each key round on each block
+%macro AES_PARALLEL_ENC_BLOCKS 16
+%define %%KEY_ROUNDS %1
+%define %%PARALLEL_DATA %2
+%define %%EARLY_LOADS %3
+%define %%MOVDQ %4
+%define %%PXOR %5
+%define %%AES_DEC %6
+%define %%AES_DEC_LAST %7
+%define %%CACHED_KEYS %8
+%define %%TMP %9
+%define %%TMP_CNT %10
+%define %%FIRST_CKEY %11
+%define %%KEY_DATA %12
+%define %%FIRST_XDATA %13
+%define %%IN %14 ; input data
+%define %%OUT %15 ; output data
+%define %%IDX %16 ; index into input and output data buffers
+
+ %assign blocks_loaded 0
+
+ %rep %%EARLY_LOADS
+ SCHEDULE_DATA_LOAD %%PARALLEL_DATA, %%EARLY_LOADS, %%MOVDQ, %%IN, %%IDX ; updates blocks_loaded
+ %endrep ; %%EARLY_LOADS
+
+ %assign current_tmp (TMP)
+ INIT_SELECT_KEY %%KEY_ROUNDS, %%CACHED_KEYS, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
+
+ %assign round 0
+ %assign key 0
+ %rep KEY_ROUNDS ; for all key rounds
+ SELECT_KEY round, %%KEY_ROUNDS, %%CACHED_KEYS, %%FIRST_CKEY, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
+
+ %assign i %%FIRST_XDATA
+	%rep %%PARALLEL_DATA	; for each block do the AES block encode step
+ %if (0 == round)
+ %%PXOR XDATA(i), KEY_REG(key) ; first round's step
+ SCHEDULE_DATA_LOAD %%PARALLEL_DATA, %%EARLY_LOADS, %%MOVDQ, %%IN, %%IDX
+
+ %elif ( (%%KEY_ROUNDS-1) == round )
+ %%AES_DEC_LAST XDATA(i), KEY_REG(key) ; last round's step
+
+ %else
+ %%AES_DEC XDATA(i), KEY_REG(key) ; middle round's (1..last-1) step
+
+ %endif
+ %assign i (i+1)
+ %endrep ;%%PARALLEL_DATA
+ %assign round (round+1)
+ %endrep ;KEY_ROUNDS
+%endmacro ; AES_PARALLEL_ENC_BLOCKS
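+;
+; Note: only %%EARLY_LOADS blocks are fetched before the key rounds start; the
+; remaining blocks are requested one per data lane during round 0 (see the
+; SCHEDULE_DATA_LOAD call above), so the memory loads overlap the first AES
+; rounds instead of stalling in front of them.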
+
+
+
+;
+; AES_ENC_BLOCKS
+; load first uncached key into TMP0 (if any)
+; AES block encrypt XDATA(%%ENC_BLOCK)
+; before using uncached key in TMP0, load next key in TMP1
+; before using uncached key in TMP1, load next key in TMP0
+%macro AES_ENC_BLOCKS 11
+%define %%TOT_ROUNDS %1
+%define %%ENC_BLOCK %2
+%define %%TMP %3
+%define %%TMP_CNT %4
+%define %%FIRST_CKEY %5
+%define %%CACHED_KEYS %6
+%define %%KEY_DATA %7
+%define %%MOVDQ %8
+%define %%PXOR %9
+%define %%AES_ENC %10
+%define %%AES_ENC_LAST %11
+
+ %assign current_tmp (%%TMP)
+ INIT_SELECT_KEY %%TOT_ROUNDS, %%CACHED_KEYS, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
+
+ %assign round 0
+ %assign key (round + %%FIRST_CKEY)
+ %rep %%TOT_ROUNDS ; for all key rounds
+ ; find the key data for this round
+ SELECT_KEY round, %%TOT_ROUNDS, %%CACHED_KEYS, %%FIRST_CKEY, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
+
+ ; encrypt block
+ %if (0 == round)
+ %%PXOR XDATA(%%ENC_BLOCK), KEY_REG(key) ; round zero step
+ %elif ( (%%TOT_ROUNDS-1) == round )
+ %%AES_ENC_LAST XDATA(%%ENC_BLOCK), KEY_REG(key) ; last round's step
+ %else
+ %%AES_ENC XDATA(%%ENC_BLOCK), KEY_REG(key) ; rounds 1..last-1 step
+ %endif ; (0 == round)
+
+ %assign round (round+1)
+ %endrep ; KEY_ROUNDS
+%endmacro ; AES_ENC_BLOCKS
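+;
+; Note: this is the single-block counterpart of AES_PARALLEL_ENC_BLOCKS; with
+; only one block in flight, the temp-register ping-pong described above is what
+; hides the latency of fetching the uncached round keys.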
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm
new file mode 100644
index 000000000..68aa227ca
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm
@@ -0,0 +1,162 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES cbc decrypt on 16n bytes doing AES by 4
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_128_sse(void *in,
+; uint8_t *IV,
+;                       uint8_t *keys,
+; void *out,
+; uint64_t len_bytes);
+;
+; arg 1: IN: pointer to input (cipher text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (plain text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;
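+; For reference, a minimal C caller might look like the sketch below.  This is
+; an illustration only; it assumes the ISA-L key-expansion helper
+; aes_keyexp_128() (whose decrypt schedule is what 'keys' is expected to point
+; at, 11 round keys of 16 bytes) and a length that is a multiple of 16:
+;
+;     #include <stdint.h>
+;     void aes_keyexp_128(const uint8_t *key, uint8_t *exp_enc, uint8_t *exp_dec);
+;     void aes_cbc_dec_128_sse(void *in, uint8_t *IV, uint8_t *keys,
+;                              void *out, uint64_t len_bytes);
+;
+;     void cbc128_decrypt(const uint8_t key[16], uint8_t iv[16],
+;                         void *ct, void *pt, uint64_t len)
+;     {
+;             uint8_t enc_keys[11 * 16], dec_keys[11 * 16];   /* 11 round keys each */
+;             aes_keyexp_128(key, enc_keys, dec_keys);
+;             aes_cbc_dec_128_sse(ct, iv, dec_keys, pt, len);
+;     }
+;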
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+
+%endif
+
+; configuration parameters for AES-CBC macros
+%define KEY_ROUNDS 11
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (2)
+%define PARALLEL_BLOCKS (8)
+%define IV_CNT (1)
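+
+; KEY_ROUNDS counts round keys (AES-128: 1 whitening key + 10 rounds = 11).
+; Of the 16 XMM registers, one holds the IV chain and the rest are split
+; between cached round keys and the PARALLEL_BLOCKS data blocks; CKEY_CNT,
+; TMP, TMP_CNT and friends are derived from these values in cbc_common.asm.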
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_DEC aesdec
+%define AES_DEC_LAST aesdeclast
+%include "cbc_common.asm"
+
+section .text
+
+align 16
+mk_global aes_cbc_dec_128_sse, function
+func(aes_cbc_dec_128_sse)
+ endbranch
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
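+	; Flow: main_loop decrypts full sets of PARALLEL_BLOCKS; any tail of
+	; 1..PARALLEL_BLOCKS-1 blocks drops into 'partials', which re-checks LEN
+	; and peels off groups of 4, then 2, then 1 block until nothing is left.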
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left; do them in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm
new file mode 100644
index 000000000..d4b6dfb2a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm
@@ -0,0 +1,162 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; routine to do AES128 CBC decrypt
+;; clobbers xmm0-15
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+
+%endif
+
+; configuration parameters for AES-CBC
+%define KEY_ROUNDS 11
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_DEC 2
+ vaesdec %1, %1, %2
+%endm
+
+%macro AES_DEC_LAST 2
+ vaesdeclast %1, %1, %2
+%endm
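+
+; cbc_common.asm invokes PXOR/AES_DEC/AES_DEC_LAST as two-operand operations;
+; the wrappers above map them onto the three-operand VEX forms so the shared
+; macros stay identical between the SSE and AVX builds.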
+
+%include "cbc_common.asm"
+
+section .text
+
+;; aes_cbc_dec_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+mk_global aes_cbc_dec_128_avx, function
+func(aes_cbc_dec_128_avx)
+ endbranch
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left; do them in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm
new file mode 100644
index 000000000..4b017d193
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm
@@ -0,0 +1,164 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES cbc decrypt on 16n bytes doing AES
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_192_sse(void *in,
+; uint8_t *IV,
+;                       uint8_t *keys,     // 13 round keys (12 rounds + 1)
+; void *out,
+; uint64_t len_bytes);
+;
+; arg 1: IN: pointer to input (cipher text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (plain text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;
+
+%include "reg_sizes.asm"
+
+%define MOVDQ movdqu
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+
+%endif
+
+; configuration parameters for AES-CBC
+%define KEY_ROUNDS 13
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (2)
+%define PARALLEL_BLOCKS (5)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_DEC aesdec
+%define AES_DEC_LAST aesdeclast
+
+%include "cbc_common.asm"
+
+section .text
+
+mk_global aes_cbc_dec_192_sse, function
+func(aes_cbc_dec_192_sse)
+ endbranch
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left; do them in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm
new file mode 100644
index 000000000..2791570ad
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm
@@ -0,0 +1,158 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES192 CBC decrypt
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+; configuration parameters for AES-CBC
+%define KEY_ROUNDS 13
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_DEC 2
+ vaesdec %1, %1, %2
+%endm
+
+%macro AES_DEC_LAST 2
+ vaesdeclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+mk_global aes_cbc_dec_192_avx, function
+func(aes_cbc_dec_192_avx)
+ endbranch
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left; do them in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm
new file mode 100644
index 000000000..44c76268e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm
@@ -0,0 +1,161 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES cbc decrypt on 16n bytes doing AES
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_256_sse(void *in,
+; uint8_t *IV,
+;                       uint8_t *keys,
+; void *out,
+; uint64_t len_bytes);
+;
+; arg 1: IN:   pointer to input (cipher text)
+; arg 2: IV:   pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT:  pointer to output (plain text)
+; arg 5: LEN:  length in bytes (multiple of 16)
+;
+
+%include "reg_sizes.asm"
+
+%define MOVDQ movdqu
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+; configuration parameters for AES-CBC
+%define KEY_ROUNDS 15
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_DEC aesdec
+%define AES_DEC_LAST aesdeclast
+
+%include "cbc_common.asm"
+
+mk_global aes_cbc_dec_256_sse, function
+func(aes_cbc_dec_256_sse)
+ endbranch
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left; do them in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm
new file mode 100644
index 000000000..cad1a6bef
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm
@@ -0,0 +1,158 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES256 CBC decrypt
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+; configuration parameters for AES-CBC
+%define KEY_ROUNDS 15
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_DEC 2
+ vaesdec %1, %1, %2
+%endm
+
+%macro AES_DEC_LAST 2
+ vaesdeclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+mk_global aes_cbc_dec_256_avx, function
+func(aes_cbc_dec_256_avx)
+ endbranch
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left; do them in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm
new file mode 100644
index 000000000..6124e2def
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_vaes_avx512.asm
@@ -0,0 +1,519 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2019-2021 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "aes_common.asm"
+%include "reg_sizes.asm"
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+[bits 64]
+default rel
+
+%define zIV zmm0
+%define zBLK_0_3 zmm1
+%define zBLK_4_7 zmm2
+%define zBLK_8_11 zmm3
+%define zBLK_12_15 zmm4
+%define zTMP0 zmm5
+%define zTMP1 zmm6
+%define zTMP2 zmm7
+%define zTMP3 zmm8
+
+%define ZKEY0 zmm17
+%define ZKEY1 zmm18
+%define ZKEY2 zmm19
+%define ZKEY3 zmm20
+%define ZKEY4 zmm21
+%define ZKEY5 zmm22
+%define ZKEY6 zmm23
+%define ZKEY7 zmm24
+%define ZKEY8 zmm25
+%define ZKEY9 zmm26
+%define ZKEY10 zmm27
+%define ZKEY11 zmm28
+%define ZKEY12 zmm29
+%define ZKEY13 zmm30
+%define ZKEY14 zmm31
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define p_in rdi
+%define p_IV rsi
+%define p_keys rdx
+%define p_out rcx
+%define num_bytes r8
+%else
+%define p_in rcx
+%define p_IV rdx
+%define p_keys r8
+%define p_out r9
+%define num_bytes rax
+%endif
+
+%define tmp r10
+%define tmp2 r11
+
+%ifdef CBCS
+%define OFFSET 160
+%else
+%define OFFSET 16
+%endif
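+
+; With CBCS defined the source/destination stride is 160 bytes, so only one
+; 16-byte block out of every ten is processed (the 1:9 pattern used by the
+; CBCS encryption mode); the plain CBC build walks the buffer 16 bytes at a
+; time.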
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; macro to preload keys
+;;; - uses ZKEY[0-14] registers (ZMM)
+%macro LOAD_KEYS 2
+%define %%KEYS %1 ; [in] key pointer
+%define %%NROUNDS %2 ; [in] numerical value, number of AES rounds
+ ; excluding 1st and last rounds.
+ ; Example: AES-128 -> value 9
+
+%assign i 0
+%rep (%%NROUNDS + 2)
+ vbroadcastf64x2 ZKEY %+ i, [%%KEYS + 16*i]
+%assign i (i + 1)
+%endrep
+
+%endmacro
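+
+;;; Note: %%NROUNDS excludes the first and last rounds (9/11/13 for
+;;; AES-128/192/256), so %%NROUNDS + 2 round keys are loaded, each one
+;;; broadcast across all four 128-bit lanes of its ZKEY register.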
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This macro is used to "cool down" the pipeline after the DECRYPT_16_PARALLEL
+;;; code, as the number of final blocks is variable.
+;;; Processes the last %%num_final_blocks blocks (1 to 15, can't be 0)
+
+%macro FINAL_BLOCKS 14
+%define %%PLAIN_OUT %1 ; [in] output buffer
+%define %%CIPH_IN %2 ; [in] input buffer
+%define %%LAST_CIPH_BLK %3 ; [in/out] ZMM with IV/last cipher blk (in idx 3)
+%define %%num_final_blocks %4 ; [in] numerical value (1 - 15)
+%define %%CIPHER_PLAIN_0_3 %5 ; [out] ZMM next 0-3 cipher blocks
+%define %%CIPHER_PLAIN_4_7 %6 ; [out] ZMM next 4-7 cipher blocks
+%define %%CIPHER_PLAIN_8_11 %7 ; [out] ZMM next 8-11 cipher blocks
+%define %%CIPHER_PLAIN_12_15 %8 ; [out] ZMM next 12-15 cipher blocks
+%define %%ZT1 %9 ; [clobbered] ZMM temporary
+%define %%ZT2 %10 ; [clobbered] ZMM temporary
+%define %%ZT3 %11 ; [clobbered] ZMM temporary
+%define %%ZT4 %12 ; [clobbered] ZMM temporary
+%define %%IA0 %13 ; [clobbered] GP temporary
+%define %%NROUNDS %14 ; [in] number of rounds; numerical value
+
+ ;; load plain/cipher text
+%ifdef CBCS
+ ZMM_LOAD_BLOCKS_0_16_OFFSET %%num_final_blocks, %%CIPH_IN, \
+ OFFSET, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%else
+ ZMM_LOAD_BLOCKS_0_16 %%num_final_blocks, %%CIPH_IN, 0, \
+ %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%endif
+ ;; Prepare final cipher text blocks to
+ ;; be XOR'd later after AESDEC
+ valignq %%ZT1, %%CIPHER_PLAIN_0_3, %%LAST_CIPH_BLK, 6
+%if %%num_final_blocks > 4
+ valignq %%ZT2, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_0_3, 6
+%endif
+%if %%num_final_blocks > 8
+ valignq %%ZT3, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_4_7, 6
+%endif
+%if %%num_final_blocks > 12
+ valignq %%ZT4, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_8_11, 6
+%endif
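+
+	;; At this point %%ZT1..%%ZT4 hold, lane for lane, the cipher block that
+	;; precedes each block being decrypted (valignq by 6 qwords shifts the
+	;; window back by one 128-bit block, pulling the carried-in last cipher
+	;; block from the top lane of %%LAST_CIPH_BLK into the bottom lane of
+	;; %%ZT1); they are XOR'd in after the AESDEC rounds below.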
+
+ ;; Update IV with last cipher block
+ ;; to be used later in DECRYPT_16_PARALLEL
+%if %%num_final_blocks == 1
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 2
+%elif %%num_final_blocks == 2
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 4
+%elif %%num_final_blocks == 3
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, 6
+%elif %%num_final_blocks == 4
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_0_3
+%elif %%num_final_blocks == 5
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 2
+%elif %%num_final_blocks == 6
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 4
+%elif %%num_final_blocks == 7
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, 6
+%elif %%num_final_blocks == 8
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_4_7
+%elif %%num_final_blocks == 9
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 2
+%elif %%num_final_blocks == 10
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 4
+%elif %%num_final_blocks == 11
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, 6
+%elif %%num_final_blocks == 12
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_8_11
+%elif %%num_final_blocks == 13
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 2
+%elif %%num_final_blocks == 14
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 4
+%elif %%num_final_blocks == 15
+ valignq %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, 6
+%endif
+
+ ;; AES rounds
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESDEC_ROUND_BLOCKS_0_16 %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15, \
+ ZKEY %+ j, j, no_data, no_data, no_data, no_data, \
+ %%num_final_blocks, %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+ ;; XOR with decrypted blocks to get plain text
+ vpxorq %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, %%ZT1
+%if %%num_final_blocks > 4
+ vpxorq %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, %%ZT2
+%endif
+%if %%num_final_blocks > 8
+ vpxorq %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, %%ZT3
+%endif
+%if %%num_final_blocks > 12
+ vpxorq %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, %%ZT4
+%endif
+
+ ;; write plain text back to output
+%ifdef CBCS
+ ZMM_STORE_BLOCKS_0_16_OFFSET %%num_final_blocks, %%PLAIN_OUT, \
+ OFFSET, %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%else
+ ZMM_STORE_BLOCKS_0_16 %%num_final_blocks, %%PLAIN_OUT, 0, \
+ %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%endif
+
+%endmacro ; FINAL_BLOCKS
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Main AES-CBC decrypt macro
+;;; - operates on single stream
+;;; - decrypts 16 blocks at a time
+%macro DECRYPT_16_PARALLEL 14
+%define %%PLAIN_OUT %1 ; [in] output buffer
+%define %%CIPH_IN %2 ; [in] input buffer
+%define %%LENGTH %3 ; [in/out] number of bytes to process
+%define %%LAST_CIPH_BLK %4 ; [in/out] ZMM with IV (first block) or last cipher block (idx 3)
+%define %%CIPHER_PLAIN_0_3 %5 ; [out] ZMM next 0-3 cipher blocks
+%define %%CIPHER_PLAIN_4_7 %6 ; [out] ZMM next 4-7 cipher blocks
+%define %%CIPHER_PLAIN_8_11 %7 ; [out] ZMM next 8-11 cipher blocks
+%define %%CIPHER_PLAIN_12_15 %8 ; [out] ZMM next 12-15 cipher blocks
+%define %%ZT1 %9 ; [clobbered] ZMM temporary
+%define %%ZT2 %10 ; [clobbered] ZMM temporary
+%define %%ZT3 %11 ; [clobbered] ZMM temporary
+%define %%ZT4 %12 ; [clobbered] ZMM temporary
+%define %%NROUNDS %13 ; [in] number of rounds; numerical value
+%define %%IA0 %14 ; [clobbered] GP temporary
+
+%ifdef CBCS
+ ZMM_LOAD_BLOCKS_0_16_OFFSET 16, %%CIPH_IN, OFFSET, \
+ %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%else
+ vmovdqu8 %%CIPHER_PLAIN_0_3, [%%CIPH_IN]
+ vmovdqu8 %%CIPHER_PLAIN_4_7, [%%CIPH_IN + 64]
+ vmovdqu8 %%CIPHER_PLAIN_8_11, [%%CIPH_IN + 128]
+ vmovdqu8 %%CIPHER_PLAIN_12_15, [%%CIPH_IN + 192]
+%endif
+ ;; prepare first set of cipher blocks for later XOR'ing
+ valignq %%ZT1, %%CIPHER_PLAIN_0_3, %%LAST_CIPH_BLK, 6
+ valignq %%ZT2, %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_0_3, 6
+ valignq %%ZT3, %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_4_7, 6
+ valignq %%ZT4, %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_8_11, 6
+
+ ;; store last cipher text block to be used for next 16 blocks
+ vmovdqa64 %%LAST_CIPH_BLK, %%CIPHER_PLAIN_12_15
+
+ ;; AES rounds
+%assign j 0
+%rep (%%NROUNDS + 2)
+ ZMM_AESDEC_ROUND_BLOCKS_0_16 %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15, \
+ ZKEY %+ j, j, no_data, no_data, no_data, no_data, \
+ 16, %%NROUNDS
+%assign j (j + 1)
+%endrep
+
+ ;; XOR with decrypted blocks to get plain text
+ vpxorq %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_0_3, %%ZT1
+ vpxorq %%CIPHER_PLAIN_4_7, %%CIPHER_PLAIN_4_7, %%ZT2
+ vpxorq %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_8_11, %%ZT3
+ vpxorq %%CIPHER_PLAIN_12_15, %%CIPHER_PLAIN_12_15, %%ZT4
+
+ ;; write plain text back to output
+%ifdef CBCS
+ ZMM_STORE_BLOCKS_0_16_OFFSET 16, %%PLAIN_OUT, OFFSET, \
+ %%CIPHER_PLAIN_0_3, %%CIPHER_PLAIN_4_7, \
+ %%CIPHER_PLAIN_8_11, %%CIPHER_PLAIN_12_15
+%else
+ vmovdqu8 [%%PLAIN_OUT], %%CIPHER_PLAIN_0_3
+ vmovdqu8 [%%PLAIN_OUT + 64], %%CIPHER_PLAIN_4_7
+ vmovdqu8 [%%PLAIN_OUT + 128], %%CIPHER_PLAIN_8_11
+ vmovdqu8 [%%PLAIN_OUT + 192], %%CIPHER_PLAIN_12_15
+%endif
+ ;; adjust input pointer and length
+ sub %%LENGTH, (16 * 16)
+ add %%CIPH_IN, (16 * OFFSET)
+ add %%PLAIN_OUT, (16 * OFFSET)
+
+%endmacro ; DECRYPT_16_PARALLEL
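+
+;;; Note: the pointers advance by 16 * OFFSET, so the same macro serves both the
+;;; contiguous CBC layout (16-byte stride) and the strided CBCS layout
+;;; (160-byte stride); %%LENGTH always drops by 256 bytes, the 16 blocks
+;;; actually processed.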
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; AES_CBC_DEC macro decrypts given data.
+;;; Flow:
+;;; - Decrypt all blocks (multiple of 16) up to final 1-15 blocks
+;;; - Decrypt final blocks (1-15 blocks)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro AES_CBC_DEC 7
+%define %%CIPH_IN %1 ;; [in] pointer to input buffer
+%define %%PLAIN_OUT %2 ;; [in] pointer to output buffer
+%define %%KEYS %3 ;; [in] pointer to expanded keys
+%define %%IV %4 ;; [in] pointer to IV
+%define %%LENGTH %5 ;; [in/out] GP register with length in bytes
+%define %%NROUNDS %6 ;; [in] Number of AES rounds; numerical value
+%define %%TMP %7 ;; [clobbered] GP register
+
+ cmp %%LENGTH, 0
+ je %%cbc_dec_done
+
+ vinserti64x2 zIV, zIV, [%%IV], 3
+
+ ;; preload keys
+ LOAD_KEYS %%KEYS, %%NROUNDS
+
+%%decrypt_16_parallel:
+ cmp %%LENGTH, 256
+ jb %%final_blocks
+
+ DECRYPT_16_PARALLEL %%PLAIN_OUT, %%CIPH_IN, %%LENGTH, zIV, \
+ zBLK_0_3, zBLK_4_7, zBLK_8_11, zBLK_12_15, \
+ zTMP0, zTMP1, zTMP2, zTMP3, %%NROUNDS, %%TMP
+ jmp %%decrypt_16_parallel
+
+%%final_blocks:
+ ;; get num final blocks
+ shr %%LENGTH, 4
+ and %%LENGTH, 0xf
+ je %%cbc_dec_done
+
+ cmp %%LENGTH, 8
+ je %%final_num_blocks_is_8
+ jl %%final_blocks_is_1_7
+
+ ; Final blocks 9-15
+ cmp %%LENGTH, 12
+ je %%final_num_blocks_is_12
+ jl %%final_blocks_is_9_11
+
+ ; Final blocks 13-15
+ cmp %%LENGTH, 15
+ je %%final_num_blocks_is_15
+ cmp %%LENGTH, 14
+ je %%final_num_blocks_is_14
+ cmp %%LENGTH, 13
+ je %%final_num_blocks_is_13
+
+%%final_blocks_is_9_11:
+ cmp %%LENGTH, 11
+ je %%final_num_blocks_is_11
+ cmp %%LENGTH, 10
+ je %%final_num_blocks_is_10
+ cmp %%LENGTH, 9
+ je %%final_num_blocks_is_9
+
+%%final_blocks_is_1_7:
+ cmp %%LENGTH, 4
+ je %%final_num_blocks_is_4
+ jl %%final_blocks_is_1_3
+
+ ; Final blocks 5-7
+ cmp %%LENGTH, 7
+ je %%final_num_blocks_is_7
+ cmp %%LENGTH, 6
+ je %%final_num_blocks_is_6
+ cmp %%LENGTH, 5
+ je %%final_num_blocks_is_5
+
+%%final_blocks_is_1_3:
+ cmp %%LENGTH, 3
+ je %%final_num_blocks_is_3
+ cmp %%LENGTH, 2
+ je %%final_num_blocks_is_2
+ jmp %%final_num_blocks_is_1
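+
+	;; The compare ladder above is a small branch tree: it selects one of 15
+	;; FINAL_BLOCKS expansions so the block count is an assembly-time constant
+	;; on each path (presumably letting the 0-16 block load/store helpers in
+	;; aes_common.asm specialize their masking per count).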
+
+
+%%final_num_blocks_is_15:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 15, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_14:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 14, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_13:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 13, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_12:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 12, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_11:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 11, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_10:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 10, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_9:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 9, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_8:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 8, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_7:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 7, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_6:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 6, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_5:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 5, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_4:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 4, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_3:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 3, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_2:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 2, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+ jmp %%cbc_dec_done
+
+%%final_num_blocks_is_1:
+ FINAL_BLOCKS %%PLAIN_OUT, %%CIPH_IN, zIV, 1, zBLK_0_3, zBLK_4_7, \
+ zBLK_8_11, zBLK_12_15, zTMP0, zTMP1, zTMP2, zTMP3, \
+ %%TMP, %%NROUNDS
+
+%%cbc_dec_done:
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+section .text
+
+%ifndef CBCS
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_128_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+mk_global aes_cbc_dec_128_vaes_avx512,function,internal
+aes_cbc_dec_128_vaes_avx512:
+ endbranch
+%ifidn __OUTPUT_FORMAT__, win64
+ mov num_bytes, [rsp + 8*5]
+%endif
+ AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 9, tmp
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_192_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+mk_global aes_cbc_dec_192_vaes_avx512,function,internal
+aes_cbc_dec_192_vaes_avx512:
+ endbranch
+%ifidn __OUTPUT_FORMAT__, win64
+ mov num_bytes, [rsp + 8*5]
+%endif
+ AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 11, tmp
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_256_vaes_avx512(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+mk_global aes_cbc_dec_256_vaes_avx512,function,internal
+aes_cbc_dec_256_vaes_avx512:
+ endbranch
+%ifidn __OUTPUT_FORMAT__, win64
+ mov num_bytes, [rsp + 8*5]
+%endif
+ AES_CBC_DEC p_in, p_out, p_keys, p_IV, num_bytes, 13, tmp
+
+ ret
+
+%endif ;; CBCS
+
+%else ; Assembler doesn't understand these opcodes. Add empty symbol for Windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_aes_cbc_dec_256_vaes_avx512
+no_aes_cbc_dec_256_vaes_avx512:
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm
new file mode 100644
index 000000000..a7fbf39b4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm
@@ -0,0 +1,137 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 128 bit CBC AES encrypt
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_128_x4(void *in,
+;; uint8_t *IV,
+;; uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN:   pointer to input (plain text)
+; arg 2: IV:   pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT:  pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 11
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+%define EARLY_BLOCKS (2)
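+
+; Note: CBC encryption chains each cipher block into the next plain-text
+; block, so a single buffer (the "_sb" variants) cannot be encrypted
+; block-parallel the way the decrypt side is; UNROLLED_LOOPS unrolls that
+; serial chain instead, and PARALLEL_BLOCKS is simply aliased to it.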
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_ENC aesenc
+%define AES_ENC_LAST aesenclast
+
+%include "cbc_common.asm"
+
+
+mk_global aes_cbc_enc_128_x4, function
+func(aes_cbc_enc_128_x4)
+ endbranch
+ FUNC_SAVE
+
+ mov IDX, 0
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm
new file mode 100644
index 000000000..24ab33fe5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm
@@ -0,0 +1,151 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 128 bit CBC AES encrypt
+;; clobbers all registers except for ARG1 and rbp
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_128_x8(void *in,
+;; uint8_t *IV,
+;;	uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqa [rsp + 0*16], xmm6
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ vmovdqa [rsp + 8*16], xmm14
+ vmovdqa [rsp + 9*16], xmm15
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16]
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ vmovdqa xmm13, [rsp + 7*16]
+ vmovdqa xmm14, [rsp + 8*16]
+ vmovdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 11
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_ENC 2
+ vaesenc %1, %1, %2
+%endm
+
+%macro AES_ENC_LAST 2
+ vaesenclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+mk_global aes_cbc_enc_128_x8, function
+func(aes_cbc_enc_128_x8)
+ endbranch
+ FUNC_SAVE
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm
new file mode 100644
index 000000000..b3d80e922
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm
@@ -0,0 +1,149 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 192 bit CBC AES encrypt
+;;; Updates In and Out pointers at end
+
+;include "mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%define MOVDQ movdqu ;; assume buffers not aligned
+%macro pxor2 2
+ MOVDQ XTMP, %2
+ pxor %1, XTMP
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_192_x4(void *in,
+;; uint8_t *IV,
+;;	uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 13
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_ENC aesenc
+%define AES_ENC_LAST aesenclast
+
+%include "cbc_common.asm"
+
+
+mk_global aes_cbc_enc_192_x4, function
+func(aes_cbc_enc_192_x4)
+ endbranch
+ FUNC_SAVE
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm
new file mode 100644
index 000000000..89d233819
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm
@@ -0,0 +1,147 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 192 bit CBC AES encrypt
+;; clobbers all registers except for ARG1 and rbp
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_192_x8(void *in,
+;; uint8_t *IV,
+;;	uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqa [rsp + 0*16], xmm6
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ vmovdqa [rsp + 8*16], xmm14
+ vmovdqa [rsp + 9*16], xmm15
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16]
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ vmovdqa xmm13, [rsp + 7*16]
+ vmovdqa xmm14, [rsp + 8*16]
+ vmovdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 13
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+
+; instruction set specific operation definitions
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_ENC 2
+ vaesenc %1, %1, %2
+%endm
+
+%macro AES_ENC_LAST 2
+ vaesenclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+mk_global aes_cbc_enc_192_x8, function
+func(aes_cbc_enc_192_x8)
+ endbranch
+ FUNC_SAVE
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm
new file mode 100644
index 000000000..ab37668c7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm
@@ -0,0 +1,141 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 256 bit CBC AES encrypt
+;;; Updates In and Out pointers at end
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_256_x4(void *in,
+;; uint8_t *IV,
+;;	uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 15
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_ENC aesenc
+%define AES_ENC_LAST aesenclast
+
+%include "cbc_common.asm"
+
+
+mk_global aes_cbc_enc_256_x4, function
+func(aes_cbc_enc_256_x4)
+ endbranch
+ FUNC_SAVE
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm
new file mode 100644
index 000000000..83e53ac11
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm
@@ -0,0 +1,148 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 256 bit CBC AES encrypt
+;; clobbers all registers except for ARG1 and rbp
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_256_x8(void *in,
+;; uint8_t *IV,
+;;	uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ vmovdqa [rsp + 0*16], xmm6
+ vmovdqa [rsp + 1*16], xmm7
+ vmovdqa [rsp + 2*16], xmm8
+ vmovdqa [rsp + 3*16], xmm9
+ vmovdqa [rsp + 4*16], xmm10
+ vmovdqa [rsp + 5*16], xmm11
+ vmovdqa [rsp + 6*16], xmm12
+ vmovdqa [rsp + 7*16], xmm13
+ vmovdqa [rsp + 8*16], xmm14
+ vmovdqa [rsp + 9*16], xmm15
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ vmovdqa xmm6, [rsp + 0*16]
+ vmovdqa xmm7, [rsp + 1*16]
+ vmovdqa xmm8, [rsp + 2*16]
+ vmovdqa xmm9, [rsp + 3*16]
+ vmovdqa xmm10, [rsp + 4*16]
+ vmovdqa xmm11, [rsp + 5*16]
+ vmovdqa xmm12, [rsp + 6*16]
+ vmovdqa xmm13, [rsp + 7*16]
+ vmovdqa xmm14, [rsp + 8*16]
+ vmovdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 15
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+
+; instruction set specific operation definitions
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_ENC 2
+ vaesenc %1, %1, %2
+%endm
+
+%macro AES_ENC_LAST 2
+ vaesenclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+mk_global aes_cbc_enc_256_x8, function
+func(aes_cbc_enc_256_x8)
+ endbranch
+ FUNC_SAVE
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm
new file mode 100644
index 000000000..0cc09afe1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm
@@ -0,0 +1,102 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+default rel
+[bits 64]
+
+extern aes_cbc_dec_128_sse
+extern aes_cbc_dec_128_avx
+extern aes_cbc_dec_192_sse
+extern aes_cbc_dec_192_avx
+extern aes_cbc_dec_256_sse
+extern aes_cbc_dec_256_avx
+
+extern aes_cbc_enc_128_x4
+extern aes_cbc_enc_128_x8
+extern aes_cbc_enc_192_x4
+extern aes_cbc_enc_192_x8
+extern aes_cbc_enc_256_x4
+extern aes_cbc_enc_256_x8
+
+%if (AS_FEATURE_LEVEL) >= 10
+extern aes_cbc_dec_128_vaes_avx512
+extern aes_cbc_dec_192_vaes_avx512
+extern aes_cbc_dec_256_vaes_avx512
+%endif
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate the aes_cbc enc and dec multibinary interfaces
+;;;;
+mbin_interface aes_cbc_dec_128
+mbin_dispatch_init7 aes_cbc_dec_128, \
+ aes_cbc_dec_128_sse, \
+ aes_cbc_dec_128_sse, \
+ aes_cbc_dec_128_avx, \
+ aes_cbc_dec_128_avx, \
+ aes_cbc_dec_128_avx, \
+ aes_cbc_dec_128_vaes_avx512
+
+mbin_interface aes_cbc_dec_192
+mbin_dispatch_init7 aes_cbc_dec_192, \
+ aes_cbc_dec_192_sse, \
+ aes_cbc_dec_192_sse, \
+ aes_cbc_dec_192_avx, \
+ aes_cbc_dec_192_avx, \
+ aes_cbc_dec_192_avx, \
+ aes_cbc_dec_192_vaes_avx512
+
+mbin_interface aes_cbc_dec_256
+mbin_dispatch_init7 aes_cbc_dec_256, \
+ aes_cbc_dec_256_sse, \
+ aes_cbc_dec_256_sse, \
+ aes_cbc_dec_256_avx, \
+ aes_cbc_dec_256_avx, \
+ aes_cbc_dec_256_avx, \
+ aes_cbc_dec_256_vaes_avx512
+
+mbin_interface aes_cbc_enc_128
+mbin_dispatch_init aes_cbc_enc_128, aes_cbc_enc_128_x4, aes_cbc_enc_128_x8, aes_cbc_enc_128_x8
+mbin_interface aes_cbc_enc_192
+mbin_dispatch_init aes_cbc_enc_192, aes_cbc_enc_192_x4, aes_cbc_enc_192_x8, aes_cbc_enc_192_x8
+mbin_interface aes_cbc_enc_256
+mbin_dispatch_init aes_cbc_enc_256, aes_cbc_enc_256_x4, aes_cbc_enc_256_x8, aes_cbc_enc_256_x8
+
+
+
+;;; func core, ver, snum
+slversion aes_cbc_enc_128, 00, 00, 0291
+slversion aes_cbc_dec_128, 00, 00, 0292
+slversion aes_cbc_enc_192, 00, 00, 0293
+slversion aes_cbc_dec_192, 00, 00, 0294
+slversion aes_cbc_enc_256, 00, 00, 0295
+slversion aes_cbc_dec_256, 00, 00, 0296
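
cbc_multibinary.asm is what turns the per-ISA encryptors and decryptors above into the single aes_cbc_enc_* / aes_cbc_dec_* symbols the library exports: the mbin_* macros install a resolver that, on first call, picks the best implementation the CPU supports (SSE, the AVX x8 variants, or VAES/AVX-512 on the decrypt paths) and jumps straight to it from then on. The C sketch below only illustrates that self-patching dispatch pattern; example_aes_cbc_enc_128() and cpu_has_avx() are placeholders, not the real macro expansion or feature test.

    #include <stdint.h>

    typedef void (*cbc_enc_fn)(void *in, uint8_t *iv, uint8_t *keys,
                               void *out, uint64_t len_bytes);

    /* The two real implementations added by this patch. */
    void aes_cbc_enc_128_x4(void *, uint8_t *, uint8_t *, void *, uint64_t);
    void aes_cbc_enc_128_x8(void *, uint8_t *, uint8_t *, void *, uint64_t);

    int cpu_has_avx(void);                  /* placeholder for the CPUID probe */

    static void resolve_first_call(void *, uint8_t *, uint8_t *, void *, uint64_t);
    static cbc_enc_fn dispatched = resolve_first_call;

    static void resolve_first_call(void *in, uint8_t *iv, uint8_t *keys,
                                   void *out, uint64_t len)
    {
            /* Decide once; every later call bypasses this function. */
            dispatched = cpu_has_avx() ? aes_cbc_enc_128_x8 : aes_cbc_enc_128_x4;
            dispatched(in, iv, keys, out, len);
    }

    /* Stands in for the exported aes_cbc_enc_128 entry point. */
    void example_aes_cbc_enc_128(void *in, uint8_t *iv, uint8_t *keys,
                                 void *out, uint64_t len)
    {
            dispatched(in, iv, keys, out, len);
    }
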
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c
new file mode 100644
index 000000000..7ae5c9078
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c
@@ -0,0 +1,339 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_cbc.h>
+#include <test.h>
+#include "ossl_helper.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static unsigned char const ic[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
+ 0x0e, 0x0f
+};
+
+static unsigned char *plaintext, *cbc_plaintext, *cyphertext, *ossl_plaintext,
+ *ossl_cyphertext;
+static uint8_t test_key[CBC_256_BITS];
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ unsigned int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+int aes_128_perf(uint8_t * key)
+{
+ int i, ret;
+
+ /* Initialize our cipher context, which can use same input vectors */
+ uint8_t *iv = NULL;
+ struct cbc_key_data *key_data = NULL;
+
+ ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN));
+ if (ret) {
+		printf("alloc error: Fail\n");
+ return 1;
+ }
+ ret = posix_memalign((void **)&key_data, 16, (sizeof(*key_data)));
+ if (ret) {
+		printf("alloc error: Fail\n");
+ return 1;
+ }
+ if ((NULL == iv) || (NULL == key_data))
+ return 1;
+
+ memcpy(iv, ic, CBC_IV_DATA_LEN);
+
+ aes_cbc_precomp(key, 128, key_data);
+ aes_cbc_enc_128(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN);
+ openssl_aes_128_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_enc_128(plaintext, iv, key_data->enc_keys,
+ plaintext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_128_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_128_cbc_enc(key, iv, TEST_LEN, plaintext, plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_128_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_dec_128(cyphertext, iv, key_data->dec_keys,
+ cbc_plaintext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_128_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_128_cbc_dec(key, iv, TEST_LEN,
+ ossl_cyphertext, ossl_plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_128_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ printf("\n");
+ return 0;
+}
+
+int aes_192_perf(uint8_t * key)
+{
+ int i, ret;
+ uint8_t *iv = NULL;
+ struct cbc_key_data *key_data = NULL;
+
+ ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN));
+ if (ret) {
+		printf("alloc error: Fail\n");
+ return 1;
+ }
+ ret = posix_memalign((void **)&key_data, 16, (sizeof(*key_data)));
+ if (ret) {
+		printf("alloc error: Fail\n");
+ return 1;
+ }
+ if ((NULL == iv) || (NULL == key_data))
+ return 1;
+
+ memcpy(iv, ic, CBC_IV_DATA_LEN);
+ aes_cbc_precomp(key, 192, key_data);
+ aes_cbc_enc_192(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN);
+ openssl_aes_192_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_enc_192(plaintext, iv, key_data->enc_keys,
+ cyphertext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_192_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_192_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_192_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_dec_192(cyphertext, iv, key_data->dec_keys,
+ cbc_plaintext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_192_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_192_cbc_dec(key, iv, TEST_LEN,
+ ossl_cyphertext, ossl_plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_192_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ printf("\n");
+ return 0;
+}
+
+int aes_256_perf(uint8_t * key)
+{
+ int i, ret;
+ uint8_t *iv = NULL;
+ struct cbc_key_data *key_data = NULL;
+
+ ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN));
+ if (ret) {
+		printf("alloc error: Fail\n");
+ return 1;
+ }
+ ret = posix_memalign((void **)&key_data, 16, (sizeof(*key_data)));
+ if (ret) {
+		printf("alloc error: Fail\n");
+ return 1;
+ }
+ if ((NULL == iv) || (NULL == key_data))
+ return 1;
+
+ aes_cbc_precomp(key, 256, key_data);
+ memcpy(iv, ic, CBC_IV_DATA_LEN);
+ aes_cbc_enc_256(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN);
+ openssl_aes_256_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_enc_256(plaintext, iv, key_data->enc_keys,
+ cyphertext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_256_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_256_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_256_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_dec_256(cyphertext, iv, key_data->dec_keys,
+ cbc_plaintext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_256_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_256_cbc_dec(key, iv, TEST_LEN,
+ ossl_cyphertext, ossl_plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_256_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ printf("\n");
+ return 0;
+}
+
+int main(void)
+{
+ uint32_t OK = 0;
+
+ srand(TEST_SEED);
+
+ plaintext = malloc(TEST_LEN);
+ cbc_plaintext = malloc(TEST_LEN);
+ cyphertext = malloc(TEST_LEN);
+ ossl_plaintext = malloc(TEST_LEN);
+ ossl_cyphertext = malloc(TEST_LEN);
+ if (NULL == plaintext || NULL == cyphertext || NULL == cbc_plaintext
+ || NULL == ossl_plaintext || NULL == ossl_cyphertext) {
+ printf("malloc of testsize:0x%x failed\n", TEST_LEN);
+ return 1;
+ }
+
+ mk_rand_data(plaintext, TEST_LEN);
+ mk_rand_data(test_key, sizeof(test_key));
+ printf("AES CBC ISA-L vs OpenSSL performance:\n");
+ OK += aes_128_perf(test_key);
+ OK += aes_192_perf(test_key);
+ OK += aes_256_perf(test_key);
+
+ return OK;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c
new file mode 100644
index 000000000..8e8f41792
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c
@@ -0,0 +1,56 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <aes_cbc.h>
+#include <aes_keyexp.h>
+
+int aes_cbc_precomp(uint8_t * key, int key_size, struct cbc_key_data *keys_blk)
+{
+ if (CBC_128_BITS == key_size) {
+ aes_keyexp_128(key, keys_blk->enc_keys, keys_blk->dec_keys);
+ } else if (CBC_192_BITS == key_size) {
+ aes_keyexp_192(key, keys_blk->enc_keys, keys_blk->dec_keys);
+ } else if (CBC_256_BITS == key_size) {
+ aes_keyexp_256(key, keys_blk->enc_keys, keys_blk->dec_keys);
+ } else {
+ //Invalid key length
+ return 1;
+ }
+ return 0;
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver aes_cbc_precomp_slver_00000297;
+struct slver aes_cbc_precomp_slver = { 0x0297, 0x00, 0x00 };
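
aes_cbc_precomp() above is the one-time key-schedule step: it selects the matching aes_keyexp_*() routine by key size, fills key_data->enc_keys and ->dec_keys, and returns non-zero for any other key length, so callers should check the return before using the round keys (the perf tool above does not). A brief, illustrative check, assuming the constants declared in aes_cbc.h:

    #include <stdint.h>
    #include <stdio.h>
    #include <aes_cbc.h>

    /* Expand a 256-bit key, refusing to continue on an unsupported size. */
    static int expand_or_fail(uint8_t *raw_key, struct cbc_key_data *kd)
    {
            if (aes_cbc_precomp(raw_key, CBC_256_BITS, kd) != 0) {
                    fprintf(stderr, "aes_cbc_precomp: unsupported key size\n");
                    return -1;
            }
            /* kd->enc_keys / kd->dec_keys are now valid for aes_cbc_enc_256()
             * and aes_cbc_dec_256(). */
            return 0;
    }
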
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h
new file mode 100644
index 000000000..7bebcaed4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h
@@ -0,0 +1,466 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef AES_CBC_STD_VECTORS_H_
+#define AES_CBC_STD_VECTORS_H_
+#include <aes_cbc.h>
+
+
+// struct to hold pointers to the cbc data vectors
+struct cbc_vector {
+ uint8_t* K; // AES Key
+ cbc_key_size K_LEN; // length of key in bits
+	uint8_t* IV;		// initialization vector (IV) used by CBC
+ uint64_t P_LEN; // length of our plaintext
+ uint8_t* P; // Plain text
+ //outputs of encryption
+ uint8_t* EXP_C; // same length as P
+ // used in vector checks, not populated in std vector array
+ uint8_t *C;
+ struct cbc_key_data *KEYS;
+};
+
+
+///////////////////////////////////////////
+// Test vectors from:
+// Intel IPSec library 1..3
+//
+///////////////////////////////////////////
+static unsigned char K1[] = {
+ 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c
+};
+static unsigned char IV1[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+};
+static unsigned char P1[] = {
+ 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a,
+ 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51,
+ 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef,
+ 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10
+};
+static unsigned char C1[] = {
+ 0x76, 0x49, 0xab, 0xac, 0x81, 0x19, 0xb2, 0x46, 0xce, 0xe9, 0x8e, 0x9b, 0x12, 0xe9, 0x19, 0x7d,
+ 0x50, 0x86, 0xcb, 0x9b, 0x50, 0x72, 0x19, 0xee, 0x95, 0xdb, 0x11, 0x3a, 0x91, 0x76, 0x78, 0xb2,
+ 0x73, 0xbe, 0xd6, 0xb8, 0xe3, 0xc1, 0x74, 0x3b, 0x71, 0x16, 0xe6, 0x9e, 0x22, 0x22, 0x95, 0x16,
+ 0x3f, 0xf1, 0xca, 0xa1, 0x68, 0x1f, 0xac, 0x09, 0x12, 0x0e, 0xca, 0x30, 0x75, 0x86, 0xe1, 0xa7
+};
+
+static unsigned char K2[] = {
+ 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81,
+ 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4
+};
+static unsigned char IV2[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+};
+static unsigned char P2[] = {
+ 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a,
+ 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51,
+ 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef,
+ 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10
+};
+static unsigned char C2[] = {
+ 0xf5, 0x8c, 0x4c, 0x04, 0xd6, 0xe5, 0xf1, 0xba, 0x77, 0x9e, 0xab, 0xfb, 0x5f, 0x7b, 0xfb, 0xd6,
+ 0x9c, 0xfc, 0x4e, 0x96, 0x7e, 0xdb, 0x80, 0x8d, 0x67, 0x9f, 0x77, 0x7b, 0xc6, 0x70, 0x2c, 0x7d,
+ 0x39, 0xf2, 0x33, 0x69, 0xa9, 0xd9, 0xba, 0xcf, 0xa5, 0x30, 0xe2, 0x63, 0x04, 0x23, 0x14, 0x61,
+ 0xb2, 0xeb, 0x05, 0xe2, 0xc3, 0x9b, 0xe9, 0xfc, 0xda, 0x6c, 0x19, 0x07, 0x8c, 0x6a, 0x9d, 0x1b
+};
+
+static unsigned char K3[] = {
+ 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81,
+ 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7
+};
+static unsigned char IV3[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+};
+static unsigned char P3[] = {
+ 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a,
+ 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51,
+ 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef,
+ 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10
+};
+static unsigned char C3[] = {
+ 0x17, 0x70, 0x1a, 0x9d, 0x29, 0xc9, 0x1a, 0x94, 0xce, 0xed, 0x72, 0x3c, 0x34, 0xe8,
+ 0x7a, 0xbe, 0x1c, 0x96, 0x84, 0x5c, 0xa8, 0xb7, 0xe8, 0x58, 0x6d, 0xfe, 0xf2, 0xfa,
+ 0x6b, 0xed, 0x24, 0x09, 0x8a, 0x52, 0xce, 0xe8, 0xd7, 0x6d, 0xb6, 0x7b, 0xfd, 0xe2,
+ 0x15, 0x53, 0xd3, 0x1c, 0x28, 0x33, 0xf7, 0x7e, 0xb5, 0x95, 0x00, 0xac, 0x49, 0x03,
+ 0xbc, 0x70, 0x76, 0xb1, 0x84, 0x65, 0xd0, 0xea
+};
+
+///////////////////////////////////////////
+// Test vectors from:
+// 'https://tools.ietf.org/html/rfc3602#section-3.2'
+// The AES-CBC Cipher Algorithm and Its Use with IPsec
+//
+///////////////////////////////////////////
+/*
+Case #1: Encrypting 16 bytes (1 block) using AES-CBC with 128-bit key
+Key : 0x06a9214036b8a15b512e03d534120006
+IV : 0x3dafba429d9eb430b422da802c9fac41
+Plaintext : "Single block msg"
+Ciphertext: 0xe353779c1079aeb82708942dbe77181a
+ *
+ */
+static unsigned char K4[] = {
+ 0x06, 0xa9, 0x21, 0x40, 0x36, 0xb8, 0xa1, 0x5b, 0x51, 0x2e, 0x03, 0xd5, 0x34, 0x12, 0x00, 0x06
+};
+static unsigned char IV4[] = {
+ 0x3d, 0xaf, 0xba, 0x42, 0x9d, 0x9e, 0xb4, 0x30, 0xb4, 0x22, 0xda, 0x80, 0x2c, 0x9f, 0xac, 0x41
+};
+static unsigned char P4[] = {
+ "Single block msg"
+};
+static unsigned char C4[] = {
+ 0xe3, 0x53, 0x77, 0x9c, 0x10, 0x79, 0xae, 0xb8, 0x27, 0x08, 0x94, 0x2d, 0xbe, 0x77, 0x18, 0x1a
+};
+
+/*
+Case #2: Encrypting 32 bytes (2 blocks) using AES-CBC with 128-bit key
+Key : 0xc286696d887c9aa0611bbb3e2025a45a
+IV : 0x562e17996d093d28ddb3ba695a2e6f58
+Plaintext : 0x000102030405060708090a0b0c0d0e0f
+ 101112131415161718191a1b1c1d1e1f
+Ciphertext: 0xd296cd94c2cccf8a3a863028b5e1dc0a
+ 7586602d253cfff91b8266bea6d61ab1
+*/
+static unsigned char K5[] = {
+ 0xc2, 0x86, 0x69, 0x6d, 0x88, 0x7c, 0x9a, 0xa0, 0x61, 0x1b, 0xbb, 0x3e, 0x20, 0x25, 0xa4, 0x5a
+};
+static unsigned char IV5[] = {
+ 0x56, 0x2e, 0x17, 0x99, 0x6d, 0x09, 0x3d, 0x28, 0xdd, 0xb3, 0xba, 0x69, 0x5a, 0x2e, 0x6f, 0x58
+};
+static unsigned char P5[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
+ 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b,
+ 0x1c, 0x1d, 0x1e, 0x1f
+};
+static unsigned char C5[] = {
+ 0xd2, 0x96, 0xcd, 0x94, 0xc2, 0xcc, 0xcf, 0x8a, 0x3a, 0x86, 0x30, 0x28, 0xb5, 0xe1,
+ 0xdc, 0x0a, 0x75, 0x86, 0x60, 0x2d, 0x25, 0x3c, 0xff, 0xf9, 0x1b, 0x82, 0x66, 0xbe,
+ 0xa6, 0xd6, 0x1a, 0xb1
+};
+
+/*
+Case #3: Encrypting 48 bytes (3 blocks) using AES-CBC with 128-bit key
+Key : 0x6c3ea0477630ce21a2ce334aa746c2cd
+IV : 0xc782dc4c098c66cbd9cd27d825682c81
+Plaintext : "This is a 48-byte message (exactly 3 AES blocks)"
+Ciphertext: 0xd0a02b3836451753d493665d33f0e886
+ 2dea54cdb293abc7506939276772f8d5
+ 021c19216bad525c8579695d83ba2684
+
+ */
+static unsigned char K6[] = {
+ 0x6c, 0x3e, 0xa0, 0x47, 0x76, 0x30, 0xce, 0x21, 0xa2, 0xce, 0x33, 0x4a, 0xa7, 0x46, 0xc2, 0xcd
+};
+static unsigned char IV6[] = {
+ 0xc7, 0x82, 0xdc, 0x4c, 0x09, 0x8c, 0x66, 0xcb, 0xd9, 0xcd, 0x27, 0xd8, 0x25, 0x68, 0x2c, 0x81
+};
+static unsigned char P6[] = {
+ "This is a 48-byte message (exactly 3 AES blocks)"
+};
+static unsigned char C6[] = {
+ 0xd0, 0xa0, 0x2b, 0x38, 0x36, 0x45, 0x17, 0x53, 0xd4, 0x93, 0x66, 0x5d, 0x33, 0xf0, 0xe8, 0x86,
+ 0x2d, 0xea, 0x54, 0xcd, 0xb2, 0x93, 0xab, 0xc7, 0x50, 0x69, 0x39, 0x27, 0x67, 0x72, 0xf8, 0xd5,
+ 0x02, 0x1c, 0x19, 0x21, 0x6b, 0xad, 0x52, 0x5c, 0x85, 0x79, 0x69, 0x5d, 0x83, 0xba, 0x26, 0x84
+};
+
+/*
+Case #4: Encrypting 64 bytes (4 blocks) using AES-CBC with 128-bit key
+Key : 0x56e47a38c5598974bc46903dba290349
+IV : 0x8ce82eefbea0da3c44699ed7db51b7d9
+Plaintext : 0xa0a1a2a3a4a5a6a7a8a9aaabacadaeaf
+ b0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ c0c1c2c3c4c5c6c7c8c9cacbcccdcecf
+ d0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+Ciphertext: 0xc30e32ffedc0774e6aff6af0869f71aa
+ 0f3af07a9a31a9c684db207eb0ef8e4e
+ 35907aa632c3ffdf868bb7b29d3d46ad
+ 83ce9f9a102ee99d49a53e87f4c3da55
+ */
+static unsigned char K7[] = {
+ 0x56, 0xe4, 0x7a, 0x38, 0xc5, 0x59, 0x89, 0x74, 0xbc, 0x46, 0x90, 0x3d, 0xba, 0x29, 0x03, 0x49
+};
+static unsigned char IV7[] = {
+ 0x8c, 0xe8, 0x2e, 0xef, 0xbe, 0xa0, 0xda, 0x3c, 0x44, 0x69, 0x9e, 0xd7, 0xdb, 0x51, 0xb7, 0xd9
+};
+static unsigned char P7[] = {
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf
+};
+static unsigned char C7[] = {
+ 0xc3, 0x0e, 0x32, 0xff, 0xed, 0xc0, 0x77, 0x4e, 0x6a, 0xff, 0x6a, 0xf0, 0x86, 0x9f, 0x71, 0xaa,
+ 0x0f, 0x3a, 0xf0, 0x7a, 0x9a, 0x31, 0xa9, 0xc6, 0x84, 0xdb, 0x20, 0x7e, 0xb0, 0xef, 0x8e, 0x4e,
+ 0x35, 0x90, 0x7a, 0xa6, 0x32, 0xc3, 0xff, 0xdf, 0x86, 0x8b, 0xb7, 0xb2, 0x9d, 0x3d, 0x46, 0xad,
+ 0x83, 0xce, 0x9f, 0x9a, 0x10, 0x2e, 0xe9, 0x9d, 0x49, 0xa5, 0x3e, 0x87, 0xf4, 0xc3, 0xda, 0x55
+};
+
+/*
+Case #5: Sample transport-mode ESP packet (ping 192.168.123.100)
+Key: 90d382b4 10eeba7a d938c46c ec1a82bf
+SPI: 4321
+Source address: 192.168.123.3
+Destination address: 192.168.123.100
+Sequence number: 1
+IV: e96e8c08 ab465763 fd098d45 dd3ff893
+
+Original packet:
+IP header (20 bytes): 45000054 08f20000 4001f9fe c0a87b03 c0a87b64
+Data (64 bytes):
+08000ebd a70a0000 8e9c083d b95b0700 08090a0b 0c0d0e0f 10111213 14151617
+18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637
+
+Augment data with:
+Padding: 01020304 05060708 090a0b0c 0d0e
+Pad length: 0e
+Next header: 01 (ICMP)
+
+Pre-encryption Data with padding, pad length and next header (80 bytes):
+08000ebd a70a0000 8e9c083d b95b0700 08090a0b 0c0d0e0f 10111213 14151617
+18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637
+01020304 05060708 090a0b0c 0d0e0e01
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500007c 08f20000 4032f9a5 c0a87b03 c0a87b64
+SPI/Seq #: 00004321 00000001
+IV: e96e8c08 ab465763 fd098d45 dd3ff893
+Encrypted Data (80 bytes):
+f663c25d 325c18c6 a9453e19 4e120849 a4870b66 cc6b9965 330013b4 898dc856
+a4699e52 3a55db08 0b59ec3a 8e4b7e52 775b07d1 db34ed9c 538ab50c 551b874a
+a269add0 47ad2d59 13ac19b7 cfbad4a6
+*/
+static unsigned char K8[] = {
+ 0x90, 0xd3, 0x82, 0xb4, 0x10, 0xee, 0xba, 0x7a, 0xd9, 0x38, 0xc4, 0x6c, 0xec, 0x1a, 0x82, 0xbf
+};
+static unsigned char IV8[] = {
+ 0xe9, 0x6e, 0x8c, 0x08, 0xab, 0x46, 0x57, 0x63, 0xfd, 0x09, 0x8d, 0x45, 0xdd, 0x3f, 0xf8, 0x93
+};
+static unsigned char P8[] = {
+ 0x08, 0x00, 0x0e, 0xbd, 0xa7, 0x0a, 0x00, 0x00, 0x8e, 0x9c, 0x08, 0x3d, 0xb9, 0x5b, 0x07, 0x00,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x01
+};
+static unsigned char C8[] = {
+ 0xf6, 0x63, 0xc2, 0x5d, 0x32, 0x5c, 0x18, 0xc6, 0xa9, 0x45, 0x3e, 0x19, 0x4e, 0x12, 0x08, 0x49,
+ 0xa4, 0x87, 0x0b, 0x66, 0xcc, 0x6b, 0x99, 0x65, 0x33, 0x00, 0x13, 0xb4, 0x89, 0x8d, 0xc8, 0x56,
+ 0xa4, 0x69, 0x9e, 0x52, 0x3a, 0x55, 0xdb, 0x08, 0x0b, 0x59, 0xec, 0x3a, 0x8e, 0x4b, 0x7e, 0x52,
+ 0x77, 0x5b, 0x07, 0xd1, 0xdb, 0x34, 0xed, 0x9c, 0x53, 0x8a, 0xb5, 0x0c, 0x55, 0x1b, 0x87, 0x4a,
+ 0xa2, 0x69, 0xad, 0xd0, 0x47, 0xad, 0x2d, 0x59, 0x13, 0xac, 0x19, 0xb7, 0xcf, 0xba, 0xd4, 0xa6
+};
+
+/*
+Case #6: Sample transport-mode ESP packet
+ (ping -p 77 -s 20 192.168.123.100)
+Key: 90d382b4 10eeba7a d938c46c ec1a82bf
+SPI: 4321
+Source address: 192.168.123.3
+Destination address: 192.168.123.100
+Sequence number: 8
+IV: 69d08df7 d203329d b093fc49 24e5bd80
+
+Original packet:
+IP header (20 bytes): 45000030 08fe0000 4001fa16 c0a87b03 c0a87b64
+Data (28 bytes):
+0800b5e8 a80a0500 a69c083d 0b660e00 77777777 77777777 77777777
+
+Augment data with:
+Padding: 0102
+Pad length: 02
+Next header: 01 (ICMP)
+
+Pre-encryption Data with padding, pad length and next header (32 bytes):
+0800b5e8 a80a0500 a69c083d 0b660e00 77777777 77777777 77777777 01020201
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500004c 08fe0000 4032f9c9 c0a87b03 c0a87b64
+SPI/Seq #: 00004321 00000008
+IV: 69d08df7 d203329d b093fc49 24e5bd80
+Encrypted Data (32 bytes):
+f5199588 1ec4e0c4 488987ce 742e8109 689bb379 d2d750c0 d915dca3 46a89f75
+ */
+static unsigned char K9[] = {
+ 0x90, 0xd3, 0x82, 0xb4, 0x10, 0xee, 0xba, 0x7a, 0xd9, 0x38, 0xc4, 0x6c, 0xec, 0x1a, 0x82, 0xbf
+};
+static unsigned char IV9[] = {
+ 0x69, 0xd0, 0x8d, 0xf7, 0xd2, 0x03, 0x32, 0x9d, 0xb0, 0x93, 0xfc, 0x49, 0x24, 0xe5, 0xbd, 0x80
+};
+static unsigned char P9[] = {
+ 0x08, 0x00, 0xb5, 0xe8, 0xa8, 0x0a, 0x05, 0x00, 0xa6, 0x9c, 0x08, 0x3d, 0x0b, 0x66, 0x0e, 0x00,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x01, 0x02, 0x02, 0x01
+};
+static unsigned char C9[] = {
+ 0xf5, 0x19, 0x95, 0x88, 0x1e, 0xc4, 0xe0, 0xc4, 0x48, 0x89, 0x87, 0xce, 0x74, 0x2e, 0x81, 0x09,
+ 0x68, 0x9b, 0xb3, 0x79, 0xd2, 0xd7, 0x50, 0xc0, 0xd9, 0x15, 0xdc, 0xa3, 0x46, 0xa8, 0x9f, 0x75
+};
+
+/*
+Case #7: Sample tunnel-mode ESP packet (ping 192.168.123.200)
+Key: 01234567 89abcdef 01234567 89abcdef
+SPI: 8765
+Source address: 192.168.123.3
+Destination address: 192.168.123.200
+Sequence number: 2
+IV: f4e76524 4f6407ad f13dc138 0f673f37
+
+Original packet:
+IP header (20 bytes): 45000054 09040000 4001f988 c0a87b03 c0a87bc8
+Data (64 bytes):
+08009f76 a90a0100 b49c083d 02a20400 08090a0b 0c0d0e0f 10111213 14151617
+18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637
+
+Augment data with:
+Padding: 01020304 05060708 090a
+Pad length: 0a
+Next header: 04 (IP-in-IP)
+
+Pre-encryption Data with original IP header, padding, pad length and
+ next header (96 bytes):
+45000054 09040000 4001f988 c0a87b03 c0a87bc8 08009f76 a90a0100 b49c083d
+02a20400 08090a0b 0c0d0e0f 10111213 14151617 18191a1b 1c1d1e1f 20212223
+24252627 28292a2b 2c2d2e2f 30313233 34353637 01020304 05060708 090a0a04
+
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500008c 09050000 4032f91e c0a87b03 c0a87bc8
+SPI/Seq #: 00008765 00000002
+IV: f4e76524 4f6407ad f13dc138 0f673f37
+Encrypted Data (96 bytes):
+773b5241 a4c44922 5e4f3ce5 ed611b0c 237ca96c f74a9301 3c1b0ea1 a0cf70f8
+e4ecaec7 8ac53aad 7a0f022b 859243c6 47752e94 a859352b 8a4d4d2d ecd136e5
+c177f132 ad3fbfb2 201ac990 4c74ee0a 109e0ca1 e4dfe9d5 a100b842 f1c22f0d
+ */
+static unsigned char K10[] = {
+ 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef
+};
+static unsigned char IV10[] = {
+ 0xf4, 0xe7, 0x65, 0x24, 0x4f, 0x64, 0x07, 0xad, 0xf1, 0x3d, 0xc1, 0x38, 0x0f, 0x67, 0x3f, 0x37
+};
+static unsigned char P10[] = {
+ 0x45, 0x00, 0x00, 0x54, 0x09, 0x04, 0x00, 0x00, 0x40, 0x01, 0xf9, 0x88, 0xc0, 0xa8, 0x7b, 0x03,
+ 0xc0, 0xa8, 0x7b, 0xc8, 0x08, 0x00, 0x9f, 0x76, 0xa9, 0x0a, 0x01, 0x00, 0xb4, 0x9c, 0x08, 0x3d,
+ 0x02, 0xa2, 0x04, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13,
+ 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23,
+ 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33,
+ 0x34, 0x35, 0x36, 0x37, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x04
+
+};
+static unsigned char C10[] = {
+ 0x77, 0x3b, 0x52, 0x41, 0xa4, 0xc4, 0x49, 0x22, 0x5e, 0x4f, 0x3c, 0xe5, 0xed, 0x61, 0x1b, 0x0c,
+ 0x23, 0x7c, 0xa9, 0x6c, 0xf7, 0x4a, 0x93, 0x01, 0x3c, 0x1b, 0x0e, 0xa1, 0xa0, 0xcf, 0x70, 0xf8,
+ 0xe4, 0xec, 0xae, 0xc7, 0x8a, 0xc5, 0x3a, 0xad, 0x7a, 0x0f, 0x02, 0x2b, 0x85, 0x92, 0x43, 0xc6,
+ 0x47, 0x75, 0x2e, 0x94, 0xa8, 0x59, 0x35, 0x2b, 0x8a, 0x4d, 0x4d, 0x2d, 0xec, 0xd1, 0x36, 0xe5,
+ 0xc1, 0x77, 0xf1, 0x32, 0xad, 0x3f, 0xbf, 0xb2, 0x20, 0x1a, 0xc9, 0x90, 0x4c, 0x74, 0xee, 0x0a,
+ 0x10, 0x9e, 0x0c, 0xa1, 0xe4, 0xdf, 0xe9, 0xd5, 0xa1, 0x00, 0xb8, 0x42, 0xf1, 0xc2, 0x2f, 0x0d
+};
+
+/*
+Case #8: Sample tunnel-mode ESP packet
+ (ping -p ff -s 40 192.168.123.200)
+Key: 01234567 89abcdef 01234567 89abcdef
+SPI: 8765
+Source address: 192.168.123.3
+Destination address: 192.168.123.200
+Sequence number: 5
+IV: 85d47224 b5f3dd5d 2101d4ea 8dffab22
+
+Original packet:
+IP header (20 bytes): 45000044 090c0000 4001f990 c0a87b03 c0a87bc8
+Data (48 bytes):
+0800d63c aa0a0200 c69c083d a3de0300 ffffffff ffffffff ffffffff ffffffff
+ffffffff ffffffff ffffffff ffffffff
+
+Augment data with:
+Padding: 01020304 05060708 090a
+Pad length: 0a
+Next header: 04 (IP-in-IP)
+
+Pre-encryption Data with original IP header, padding, pad length and
+ next header (80 bytes):
+45000044 090c0000 4001f990 c0a87b03 c0a87bc8 0800d63c aa0a0200 c69c083d
+a3de0300 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff
+ffffffff 01020304 05060708 090a0a04
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500007c 090d0000 4032f926 c0a87b03 c0a87bc8
+SPI/Seq #: 00008765 00000005
+IV: 85d47224 b5f3dd5d 2101d4ea 8dffab22
+Encrypted Data (80 bytes):
+15b92683 819596a8 047232cc 00f7048f e45318e1 1f8a0f62 ede3c3fc 61203bb5
+0f980a08 c9843fd3 a1b06d5c 07ff9639 b7eb7dfb 3512e5de 435e7207 ed971ef3
+d2726d9b 5ef6affc 6d17a0de cbb13892
+ */
+static unsigned char K11[] = {
+ 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef
+};
+static unsigned char IV11[] = {
+ 0x85, 0xd4, 0x72, 0x24, 0xb5, 0xf3, 0xdd, 0x5d, 0x21, 0x01, 0xd4, 0xea, 0x8d, 0xff, 0xab, 0x22
+};
+static unsigned char P11[] = {
+ 0x45, 0x00, 0x00, 0x44, 0x09, 0x0c, 0x00, 0x00, 0x40, 0x01, 0xf9, 0x90, 0xc0, 0xa8, 0x7b, 0x03,
+ 0xc0, 0xa8, 0x7b, 0xc8, 0x08, 0x00, 0xd6, 0x3c, 0xaa, 0x0a, 0x02, 0x00, 0xc6, 0x9c, 0x08, 0x3d,
+ 0xa3, 0xde, 0x03, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x04
+};
+static unsigned char C11[] = {
+ 0x15, 0xb9, 0x26, 0x83, 0x81, 0x95, 0x96, 0xa8, 0x04, 0x72, 0x32, 0xcc, 0x00, 0xf7, 0x04, 0x8f,
+ 0xe4, 0x53, 0x18, 0xe1, 0x1f, 0x8a, 0x0f, 0x62, 0xed, 0xe3, 0xc3, 0xfc, 0x61, 0x20, 0x3b, 0xb5,
+ 0x0f, 0x98, 0x0a, 0x08, 0xc9, 0x84, 0x3f, 0xd3, 0xa1, 0xb0, 0x6d, 0x5c, 0x07, 0xff, 0x96, 0x39,
+ 0xb7, 0xeb, 0x7d, 0xfb, 0x35, 0x12, 0xe5, 0xde, 0x43, 0x5e, 0x72, 0x07, 0xed, 0x97, 0x1e, 0xf3,
+ 0xd2, 0x72, 0x6d, 0x9b, 0x5e, 0xf6, 0xaf, 0xfc, 0x6d, 0x17, 0xa0, 0xde, 0xcb, 0xb1, 0x38, 0x92
+};
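Cases #7 and #8 above both extend the original packet with an ESP-style trailer (incrementing pad bytes 01, 02, ..., a pad-length byte, then a next-header byte of 0x04 for IP-in-IP) so that the CBC input becomes a multiple of the 16-byte block size. A minimal sketch of that padding step, using illustrative names that are not part of this file:

#include <stdio.h>
#include <stddef.h>

/* Build the ESP trailer described in the vector comments above: incrementing
 * pad bytes, then the pad-length byte, then the next-header byte. */
static size_t esp_cbc_pad(unsigned char *buf, size_t len, unsigned char next_hdr)
{
	size_t pad = (16 - ((len + 2) % 16)) % 16;
	size_t i;

	for (i = 0; i < pad; i++)
		buf[len + i] = (unsigned char)(i + 1);
	buf[len + pad] = (unsigned char)pad;	/* pad length */
	buf[len + pad + 1] = next_hdr;	/* next header (0x04 = IP-in-IP) */
	return len + pad + 2;
}

int main(void)
{
	unsigned char buf[128] = { 0 };
	/* Case #7: 20-byte IP header + 64 bytes of data = 84 bytes in, 96 bytes out */
	size_t out = esp_cbc_pad(buf, 84, 0x04);

	printf("padded length = %zu, pad length byte = 0x%02x\n", out, buf[out - 2]);
	return 0;
}

Running the same helper on Case #8 (68 bytes in) yields 80 bytes with the same 0x0a pad-length byte, matching the padded data shown above.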
+
+
+#define min_size(a, b) (((a)<(b))?(a):(b))
+// Plain and cypher text will be the same size
+// Those vectors using strings for plain text have an extra null terminator that needs
+// to be ignored
+#define vect_size(P, C) (min_size((sizeof(P)),(sizeof(C))))
+#define CBC_KEY_LEN(kdata) (sizeof(kdata))
+
+//field order {K, Klen, IV, Plen, P, C};
+#define vector(N) {K##N, (CBC_KEY_LEN(K##N)), IV##N, vect_size(P##N,C##N), P##N, C##N, NULL, NULL, /*NULL, NULL*/}
+struct cbc_vector const cbc_vectors[] = {
+ vector(1),
+ vector(2),
+ vector(3),
+ vector(4),
+ vector(5),
+ vector(6),
+ vector(7),
+ vector(8),
+ vector(9),
+ vector(10),
+ vector(11),
+};
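As the comment on vect_size() notes, taking the smaller of the two sizeof() values keeps a string-style plaintext vector from counting its trailing NUL. A small standalone illustration of that behaviour (not part of the test suite; the macros are copied from above):

#include <stdio.h>

#define min_size(a, b) (((a)<(b))?(a):(b))
#define vect_size(P, C) (min_size((sizeof(P)),(sizeof(C))))

int main(void)
{
	static unsigned char C[16];	/* ciphertext: exactly one AES block */
	static char P[] = "0123456789abcdef";	/* 16 characters plus an implicit NUL */

	/* prints: sizeof(P)=17 sizeof(C)=16 vect_size=16 */
	printf("sizeof(P)=%zu sizeof(C)=%zu vect_size=%zu\n",
	       sizeof(P), sizeof(C), vect_size(P, C));
	return 0;
}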
+
+#endif /* AES_CBC_STD_VECTORS_H_ */
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c
new file mode 100644
index 000000000..aa9412c35
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c
@@ -0,0 +1,443 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <aes_cbc.h>
+#include "types.h"
+#include "ossl_helper.h"
+#include "cbc_std_vectors.h"
+
+//#define CBC_VECTORS_VERBOSE
+//#define CBC_VECTORS_EXTRA_VERBOSE
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#ifndef RANDOMS
+# define RANDOMS 100
+#endif
+#ifndef TEST_LEN
+# define TEST_LEN (8*1024*1024)
+#endif
+#ifndef PAGE_LEN
+# define PAGE_LEN (4*1024)
+#endif
+#ifndef MAX_UNALINED
+# define MAX_UNALINED (16)
+#endif
+
+static cbc_key_size const Ksize[] = { CBC_128_BITS, CBC_192_BITS, CBC_256_BITS };
+
+typedef void (*aes_cbc_generic)(uint8_t * in,
+ uint8_t * IV,
+ uint8_t * keys, uint8_t * out, uint64_t len_bytes);
+
+int OpenSslEnc(uint8_t k_len,
+ uint8_t * key, uint8_t * in, uint8_t * iv, uint8_t * out, uint64_t len_bytes)
+{
+ if (CBC_128_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL128 ");
+#endif
+ openssl_aes_128_cbc_enc(key, (uint8_t *) iv, len_bytes, in, out);
+ } else if (CBC_192_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL192 ");
+#endif
+ openssl_aes_192_cbc_enc(key, (uint8_t *) iv, len_bytes, in, out);
+ } else if (CBC_256_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL256 ");
+ fflush(0);
+#endif
+ openssl_aes_256_cbc_enc(key, (uint8_t *) iv, len_bytes, in, out);
+ } else {
+ fprintf(stderr, "Invalid key length: %d\n", k_len);
+ return 1;
+ }
+ return 0;
+}
+
+int OpenSslDec(uint8_t k_len,
+ uint8_t * key, uint8_t * in, uint8_t * iv, uint8_t * out, uint64_t len_bytes)
+{
+ if (CBC_128_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL128 ");
+#endif
+ openssl_aes_128_cbc_dec(key, (uint8_t *) iv, len_bytes, in, out);
+ } else if (CBC_192_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL192 ");
+#endif
+ openssl_aes_192_cbc_dec(key, (uint8_t *) iv, len_bytes, in, out);
+ } else if (CBC_256_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL256 ");
+#endif
+ openssl_aes_256_cbc_dec(key, (uint8_t *) iv, len_bytes, in, out);
+ } else {
+ fprintf(stderr, "Invalid key length: %d\n", k_len);
+ return 1;
+ }
+ return 0;
+}
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+ uint64_t a;
+
+ mismatch = memcmp(test, expected, len);
+ if (!mismatch) {
+ return OK;
+
+ } else {
+ OK = 1;
+ printf(" failed %s \t\t", data_name);
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ return OK;
+}
+
+int check_vector(struct cbc_vector *vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ int OK = 0;
+ aes_cbc_generic enc;
+ aes_cbc_generic dec;
+
+#ifdef CBC_VECTORS_VERBOSE
+ printf(" Keylen:%d PLen:%d ", (int)vector->K_LEN, (int)vector->P_LEN);
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" K:%p P:%p C:%p IV:%p expC:%p Keys:%p ", vector->K, vector->P, vector->C,
+ vector->IV, vector->EXP_C, vector->KEYS);
+#endif
+ fflush(0);
+#else
+ printf(".");
+#endif
+
+ if (CBC_128_BITS == vector->K_LEN) {
+ enc = (aes_cbc_generic) & aes_cbc_enc_128;
+ dec = (aes_cbc_generic) & aes_cbc_dec_128;
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" CBC128 ");
+#endif
+ } else if (CBC_192_BITS == vector->K_LEN) {
+ enc = (aes_cbc_generic) & aes_cbc_enc_192;
+ dec = (aes_cbc_generic) & aes_cbc_dec_192;
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" CBC192 ");
+#endif
+ } else if (CBC_256_BITS == vector->K_LEN) {
+ enc = (aes_cbc_generic) & aes_cbc_enc_256;
+ dec = (aes_cbc_generic) & aes_cbc_dec_256;
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" CBC256 ");
+#endif
+ } else {
+ printf("Invalid key length: %d\n", vector->K_LEN);
+ return 1;
+ }
+
+	// Allocate scratch buffers: a copy of the plaintext and the OpenSSL ciphertext
+ pt_test = malloc(vector->P_LEN);
+ o_ct_test = malloc(vector->P_LEN);
+ if ((pt_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+
+ aes_cbc_precomp(vector->K, vector->K_LEN, vector->KEYS);
+
+#ifdef CBC_VECTORS_VERBOSE
+ fflush(0);
+#endif
+ ////
+	// ISA-L Encrypt
+ ////
+ enc(vector->P, vector->IV, vector->KEYS->enc_keys, vector->C, vector->P_LEN);
+	if (NULL != vector->EXP_C) {	// when the expected ciphertext is known, verify it matches
+ OK |=
+ check_data(vector->EXP_C, vector->C, vector->P_LEN,
+ "ISA-L expected cypher text (C)");
+ }
+ OpenSslEnc(vector->K_LEN, vector->K, vector->P, vector->IV, o_ct_test, vector->P_LEN);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->P_LEN,
+ "OpenSSL vs ISA-L cypher text (C)");
+
+ memcpy(pt_test, vector->P, vector->P_LEN);
+ memset(vector->P, 0, vector->P_LEN);
+#ifdef CBC_VECTORS_VERBOSE
+ fflush(0);
+#endif
+
+ ////
+	// ISA-L Decrypt
+ ////
+ dec(vector->C, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN);
+ OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->P_LEN);
+ dec(o_ct_test, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN);
+ OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted OpenSSL (P)");
+ memset(vector->P, 0, vector->P_LEN);
+ OpenSslDec(vector->K_LEN, vector->K, vector->C, vector->IV, vector->P, vector->P_LEN);
+ OK |= check_data(vector->P, pt_test, vector->P_LEN, "OpenSSL decrypted ISA-L (P)");
+#ifdef CBC_VECTORS_VERBOSE
+ if (OK)
+ printf("Failed");
+ else
+ printf("Passed");
+
+ printf("\n");
+#endif
+
+ return OK;
+}
+
+int test_std_combinations(void)
+{
+ int const vectors_cnt = sizeof(cbc_vectors) / sizeof(cbc_vectors[0]);
+ int i, ret;
+ uint8_t *iv = NULL;
+
+ printf("AES CBC standard test vectors:");
+#ifdef CBC_VECTORS_VERBOSE
+ printf("\n");
+#endif
+ ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN));
+ if ((0 != ret) || (NULL == iv))
+ return 1;
+
+ for (i = 0; (i < vectors_cnt); i++) {
+ struct cbc_vector vect = cbc_vectors[i];
+
+ ret = posix_memalign((void **)&vect.KEYS, 16, (sizeof(*vect.KEYS)));
+ if ((0 != ret) || (NULL == vect.KEYS))
+ return 1;
+		// IV data must be aligned to a 16 byte boundary, so copy it into an aligned buffer and point at that instead
+ memcpy(iv, vect.IV, CBC_IV_DATA_LEN);
+ vect.IV = iv;
+		vect.C = malloc(vect.P_LEN);
+ if ((NULL == vect.C))
+ return 1;
+#ifdef CBC_VECTORS_VERBOSE
+ printf("vector[%d of %d] ", i, vectors_cnt);
+#endif
+ if (0 == (i % 25))
+ printf("\n");
+ if (0 == (i % 10))
+ fflush(0);
+
+ if (0 != check_vector(&vect))
+ return 1;
+
+ aligned_free(vect.KEYS);
+ free(vect.C);
+ }
+
+ aligned_free(iv);
+ printf("\n");
+ return 0;
+}
+
+int test_random_combinations(void)
+{
+ struct cbc_vector test;
+ int t, ret;
+
+ printf("AES CBC random test vectors:");
+#ifdef CBC_VECTORS_VERBOSE
+ fflush(0);
+#endif
+ test.IV = NULL;
+ ret = posix_memalign((void **)&test.IV, 16, (CBC_IV_DATA_LEN));
+ if ((0 != ret) || (NULL == test.IV))
+ return 1;
+ test.KEYS = NULL;
+ ret = posix_memalign((void **)&test.KEYS, 16, (sizeof(*test.KEYS)));
+ if ((0 != ret) || (NULL == test.KEYS))
+ return 1;
+
+ for (t = 0; RANDOMS > t; t++) {
+		int Plen = 16 + ((rand() % TEST_LEN) & ~0xf);	// must be a 16-byte multiple
+ int offset = (rand() % MAX_UNALINED);
+ int Kindex = (rand() % (sizeof(Ksize) / sizeof(Ksize[0]))); // select one of the valid key sizes
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+
+ test.C = NULL;
+ test.P = NULL;
+ test.K = NULL;
+ test.EXP_C = NULL;
+ test.P_LEN = Plen;
+ test.K_LEN = Ksize[Kindex];
+
+ test.P = malloc(test.P_LEN + offset);
+ test.C = malloc(test.P_LEN + offset);
+ test.K = malloc(test.K_LEN + offset);
+ if ((NULL == test.P) || (NULL == test.C) || (NULL == test.K)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return -1;
+ }
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+
+ mk_rand_data(test.P, test.P_LEN);
+ mk_rand_data(test.K, test.K_LEN);
+ mk_rand_data(test.IV, CBC_IV_DATA_LEN);
+
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" Offset:0x%x ", offset);
+#endif
+ if (0 != check_vector(&test))
+ return 1;
+
+ test.C -= offset;
+ free(test.C);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ }
+
+ aligned_free(test.IV);
+ aligned_free(test.KEYS);
+ printf("\n");
+ return 0;
+}
+
+int test_efence_combinations(void)
+{
+ struct cbc_vector test;
+ int offset = 0;
+ int key_idx;
+ uint8_t *P = NULL, *C = NULL, *K = NULL, *IV = NULL;
+ uint8_t *key_data = NULL;
+
+ P = malloc(PAGE_LEN);
+ C = malloc(PAGE_LEN);
+ K = malloc(PAGE_LEN);
+ IV = malloc(PAGE_LEN);
+ key_data = malloc(PAGE_LEN);
+
+ if ((NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV)
+ || (NULL == key_data)
+ ) {
+ printf("malloc of testsize:0x%x failed\n", PAGE_LEN);
+ return -1;
+ }
+ // place buffers to end at page boundary
+ test.P_LEN = PAGE_LEN / 2;
+ test.EXP_C = NULL;
+
+ printf("AES CBC efence test vectors:");
+ for (key_idx = 0; key_idx < (sizeof(Ksize) / sizeof(Ksize[0])); key_idx++) {
+ test.K_LEN = Ksize[key_idx];
+
+ for (offset = 0; MAX_UNALINED > offset; offset++) {
+ if (0 == (offset % 80))
+ printf("\n");
+ // move the start and size of the data block towards the end of the page
+			test.P_LEN = ((PAGE_LEN / (1 + (2 * offset))) & ~0xff);	// round down to a multiple of 256, which keeps it a 16 byte multiple
+ if (16 > test.P_LEN)
+ test.P_LEN = 16;
+ //Place data at end of page
+ test.P = P + PAGE_LEN - test.P_LEN - offset;
+ test.C = C + PAGE_LEN - test.P_LEN - offset;
+ test.K = K + PAGE_LEN - test.K_LEN - offset;
+ test.IV = IV + PAGE_LEN - CBC_IV_DATA_LEN - offset;
+			test.IV = test.IV - ((uint64_t) test.IV & 0xff);	// round down to a 256 byte boundary (keeps 16 byte alignment)
+ test.KEYS = (struct cbc_key_data *)
+ (key_data + PAGE_LEN - sizeof(*test.KEYS) - offset);
+ test.KEYS = (struct cbc_key_data *)
+			    ((uint8_t *) test.KEYS - ((uint64_t) test.KEYS & 0xff));	// round down to a 256 byte boundary (keeps 16 byte alignment)
+
+ mk_rand_data(test.P, test.P_LEN);
+ mk_rand_data(test.K, test.K_LEN);
+ mk_rand_data(test.IV, CBC_IV_DATA_LEN);
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" Offset:0x%x ", offset);
+#endif
+ if (0 != check_vector(&test))
+ return 1;
+ }
+
+ }
+
+ free(P);
+ free(C);
+ free(K);
+ free(IV);
+ free(key_data);
+ printf("\n");
+ return 0;
+}
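The loop above shrinks each data block and pushes it toward the end of its PAGE_LEN buffer; run under a tool such as Electric Fence, where every allocation ends just before an inaccessible guard page, any access past the end of the block then faults immediately. A minimal sketch of the same placement arithmetic, with illustrative names not taken from this file:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_LEN (4 * 1024)

int main(void)
{
	unsigned char *page = malloc(PAGE_LEN);
	int offset;

	if (page == NULL)
		return 1;

	for (offset = 0; offset < 4; offset++) {
		/* same rounding as the test: a multiple of 256 bytes, at least 16 */
		size_t len = (PAGE_LEN / (1 + 2 * offset)) & ~(size_t)0xff;
		unsigned char *data;

		if (len < 16)
			len = 16;
		/* the block ends exactly `offset` bytes before the end of the buffer */
		data = page + PAGE_LEN - len - offset;
		printf("offset=%d len=%zu tail gap=%td\n",
		       offset, len, (page + PAGE_LEN) - (data + len));
	}
	free(page);
	return 0;
}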
+
+int main(void)
+{
+ uint32_t OK = 0;
+
+ srand(TEST_SEED);
+ OK |= test_std_combinations();
+ OK |= test_random_combinations();
+ OK |= test_efence_combinations();
+ if (0 == OK) {
+ printf("...Pass\n");
+ } else {
+ printf("...Fail\n");
+ }
+ return OK;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c
new file mode 100644
index 000000000..0558b4254
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c
@@ -0,0 +1,183 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * Run list of standard CBC test vectors through encode and decode checks.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <aes_cbc.h>
+#include "types.h"
+#include "cbc_std_vectors.h"
+
+typedef void (*aes_cbc_generic)(uint8_t * in, uint8_t * IV, uint8_t * keys, uint8_t * out,
+ uint64_t len_bytes);
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+ uint64_t a;
+
+ mismatch = memcmp(test, expected, len);
+ if (!mismatch) {
+ return OK;
+
+ } else {
+ OK = 1;
+ printf(" failed %s \t\t", data_name);
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ return OK;
+}
+
+int check_vector(struct cbc_vector *vector)
+{
+ uint8_t *pt_test = NULL;
+ int OK = 0;
+ aes_cbc_generic enc;
+ aes_cbc_generic dec;
+
+ DEBUG_PRINT((" Keylen:%d PLen:%d ", (int)vector->K_LEN, (int)vector->P_LEN));
+ DEBUG_PRINT((" K:%p P:%p C:%p IV:%p expC:%p Keys:%p ", vector->K, vector->P, vector->C,
+ vector->IV, vector->EXP_C, vector->KEYS));
+ printf(".");
+
+ switch (vector->K_LEN) {
+ case CBC_128_BITS:
+ enc = (aes_cbc_generic) & aes_cbc_enc_128;
+ dec = (aes_cbc_generic) & aes_cbc_dec_128;
+ DEBUG_PRINT((" CBC128 "));
+ break;
+ case CBC_192_BITS:
+ enc = (aes_cbc_generic) & aes_cbc_enc_192;
+ dec = (aes_cbc_generic) & aes_cbc_dec_192;
+ DEBUG_PRINT((" CBC192 "));
+ break;
+ case CBC_256_BITS:
+ enc = (aes_cbc_generic) & aes_cbc_enc_256;
+ dec = (aes_cbc_generic) & aes_cbc_dec_256;
+ DEBUG_PRINT((" CBC256 "));
+ break;
+ default:
+ printf("Invalid key length: %d\n", vector->K_LEN);
+ return 1;
+ }
+
+	// Allocate a scratch buffer to hold a copy of the plaintext
+ pt_test = malloc(vector->P_LEN);
+
+ if (pt_test == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+
+ aes_cbc_precomp(vector->K, vector->K_LEN, vector->KEYS);
+
+ ////
+	// ISA-L Encrypt
+ ////
+ enc(vector->P, vector->IV, vector->KEYS->enc_keys, vector->C, vector->P_LEN);
+
+	if (NULL != vector->EXP_C) {	// when the expected ciphertext is known, verify it matches
+ OK |= check_data(vector->EXP_C, vector->C, vector->P_LEN,
+ "ISA-L expected cypher text (C)");
+ }
+ memcpy(pt_test, vector->P, vector->P_LEN);
+ memset(vector->P, 0, vector->P_LEN);
+
+ ////
+	// ISA-L Decrypt
+ ////
+ dec(vector->C, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN);
+ OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted plain text (P)");
+ DEBUG_PRINT((OK ? "Failed\n" : "Passed\n"));
+
+ free(pt_test);
+ return OK;
+}
+
+int test_std_combinations(void)
+{
+ int const vectors_cnt = sizeof(cbc_vectors) / sizeof(cbc_vectors[0]);
+ int i, ret;
+ uint8_t *iv = NULL;
+
+ printf("AES CBC standard test vectors: ");
+
+ ret = posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN));
+ if ((0 != ret) || (NULL == iv))
+ return 1;
+
+ for (i = 0; (i < vectors_cnt); i++) {
+ struct cbc_vector vect = cbc_vectors[i];
+
+ ret = posix_memalign((void **)&(vect.KEYS), 16, sizeof(*vect.KEYS));
+ if ((0 != ret) || (NULL == vect.KEYS))
+ return 1;
+
+		// IV data must be aligned to a 16 byte boundary, so copy it
+		// into an aligned buffer and point at that instead
+ memcpy(iv, vect.IV, CBC_IV_DATA_LEN);
+ vect.IV = iv;
+ vect.C = malloc(vect.P_LEN);
+ if (NULL == vect.C)
+ return 1;
+
+ DEBUG_PRINT(("vector[%d of %d] ", i, vectors_cnt));
+
+ if (0 != check_vector(&vect))
+ return 1;
+
+ aligned_free(vect.KEYS);
+ free(vect.C);
+ }
+
+ aligned_free(iv);
+ return 0;
+}
+
+int main(void)
+{
+ uint32_t OK = 0;
+
+ OK = test_std_combinations();
+
+ printf(0 == OK ? "Pass\n" : "Fail\n");
+ return OK;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm b/src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm
new file mode 100644
index 000000000..2c80401e9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/clear_regs.asm
@@ -0,0 +1,202 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _CLEAR_REGS_ASM_
+%define _CLEAR_REGS_ASM_
+
+%ifndef LINUX
+%ifidn __OUTPUT_FORMAT__, elf64
+%define LINUX
+%endif
+%endif
+
+;
+; This macro clears any GP registers passed
+;
+%macro clear_gps 1-16
+%define %%NUM_REGS %0
+%rep %%NUM_REGS
+ xor %1, %1
+%rotate 1
+%endrep
+%endmacro
+
+;
+; This macro clears any XMM registers passed on SSE
+;
+%macro clear_xmms_sse 1-16
+%define %%NUM_REGS %0
+%rep %%NUM_REGS
+ pxor %1, %1
+%rotate 1
+%endrep
+%endmacro
+
+;
+; This macro clears any XMM registers passed on AVX
+;
+%macro clear_xmms_avx 1-16
+%define %%NUM_REGS %0
+%rep %%NUM_REGS
+ vpxor %1, %1
+%rotate 1
+%endrep
+%endmacro
+
+;
+; This macro clears any YMM registers passed
+;
+%macro clear_ymms 1-16
+%define %%NUM_REGS %0
+%rep %%NUM_REGS
+ vpxor %1, %1
+%rotate 1
+%endrep
+%endmacro
+
+;
+; This macro clears any ZMM registers passed
+;
+%macro clear_zmms 1-32
+%define %%NUM_REGS %0
+%rep %%NUM_REGS
+ vpxorq %1, %1
+%rotate 1
+%endrep
+%endmacro
+
+;
+; This macro clears all scratch GP registers
+; for Windows or Linux
+;
+%macro clear_scratch_gps_asm 0
+ clear_gps rax, rcx, rdx, r8, r9, r10, r11
+%ifdef LINUX
+ clear_gps rdi, rsi
+%endif
+%endmacro
+
+;
+; This macro clears all scratch XMM registers on SSE
+;
+%macro clear_scratch_xmms_sse_asm 0
+%ifdef LINUX
+%assign i 0
+%rep 16
+ pxor xmm %+ i, xmm %+ i
+%assign i (i+1)
+%endrep
+; On Windows, XMM0-XMM5 registers are scratch registers
+%else
+%assign i 0
+%rep 6
+ pxor xmm %+ i, xmm %+ i
+%assign i (i+1)
+%endrep
+%endif ; LINUX
+%endmacro
+
+;
+; This macro clears all scratch XMM registers on AVX
+;
+%macro clear_scratch_xmms_avx_asm 0
+%ifdef LINUX
+ vzeroall
+; On Windows, XMM0-XMM5 registers are scratch registers
+%else
+%assign i 0
+%rep 6
+ vpxor xmm %+ i, xmm %+ i
+%assign i (i+1)
+%endrep
+%endif ; LINUX
+%endmacro
+
+;
+; This macro clears all scratch YMM registers
+;
+; It should be called before restoring the XMM registers
+; for Windows (XMM6-XMM15)
+;
+%macro clear_scratch_ymms_asm 0
+; On Linux, all YMM registers are scratch registers
+%ifdef LINUX
+ vzeroall
+; On Windows, YMM0-YMM5 registers are scratch registers.
+; The upper 128 bits of YMM6-YMM15 are scratch registers too, but the
+; lower 128 bits are restored before these functions return, which
+; also clears the upper bits.
+%else
+%assign i 0
+%rep 6
+ vpxor ymm %+ i, ymm %+ i
+%assign i (i+1)
+%endrep
+%endif ; LINUX
+%endmacro
+
+;
+; This macro clears all scratch ZMM registers
+;
+; It should be called before restoring the XMM registers
+; for Windows (XMM6-XMM15). YMM registers are used
+; on purpose, since XOR'ing YMM registers is faster
+; than XOR'ing ZMM registers, and the operation also
+; clears the upper 256 bits.
+;
+%macro clear_scratch_zmms_asm 0
+; On Linux, all ZMM registers are scratch registers
+%ifdef LINUX
+ vzeroall
+ ;; vzeroall only clears the first 16 ZMM registers
+%assign i 16
+%rep 16
+ vpxorq ymm %+ i, ymm %+ i
+%assign i (i+1)
+%endrep
+; On Windows, ZMM0-ZMM5 and ZMM16-ZMM31 registers are scratch registers.
+; The upper 384 bits of ZMM6-ZMM15 are scratch registers too, but the
+; lower 128 bits are restored before these functions return, which
+; also clears the upper bits.
+%else
+%assign i 0
+%rep 6
+ vpxorq ymm %+ i, ymm %+ i
+%assign i (i+1)
+%endrep
+
+%assign i 16
+%rep 16
+ vpxorq ymm %+ i, ymm %+ i
+%assign i (i+1)
+%endrep
+%endif ; LINUX
+%endmacro
+
+%endif ;; _CLEAR_REGS_ASM_
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm
new file mode 100644
index 000000000..98304c552
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%include "gcm_avx_gen2.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm
new file mode 100644
index 000000000..5ee5e7b48
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_avx_gen2.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm
new file mode 100644
index 000000000..902c17237
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%include "gcm_avx_gen4.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm
new file mode 100644
index 000000000..1e55d24cf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_avx_gen4.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm
new file mode 100644
index 000000000..1717a8662
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%include "gcm_sse.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm
new file mode 100644
index 000000000..d17402bea
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_sse.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm
new file mode 100644
index 000000000..71f284789
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512.asm
@@ -0,0 +1,32 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+;; single buffer implementation
+%include "gcm_vaes_avx512.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm
new file mode 100644
index 000000000..c0c587133
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_vaes_avx512_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM128_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_vaes_avx512.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm
new file mode 100644
index 000000000..4b159cefb
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%include "gcm_avx_gen2.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm
new file mode 100644
index 000000000..822ef07cc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_avx_gen2.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm
new file mode 100644
index 000000000..f6050a8ff
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%include "gcm_avx_gen4.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm
new file mode 100644
index 000000000..5959d698f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_avx_gen4.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm
new file mode 100644
index 000000000..c583d02b8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm
@@ -0,0 +1,31 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%include "gcm_sse.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm
new file mode 100644
index 000000000..5952a6005
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_sse.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm
new file mode 100644
index 000000000..bd318fcd1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512.asm
@@ -0,0 +1,32 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+;; single buffer implementation
+%include "gcm_vaes_avx512.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm
new file mode 100644
index 000000000..da2f611b4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_vaes_avx512_nt.asm
@@ -0,0 +1,33 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define GCM256_MODE 1
+%define NT_LDST
+%define FUNCT_EXTENSION _nt
+%include "gcm_vaes_avx512.asm"
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm
new file mode 100644
index 000000000..90db18910
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen2.asm
@@ -0,0 +1,2130 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+;       Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+;       from the definition of the spec, aadLen must be a multiple of 4 bytes.
+;       The code additionally supports aadLen of any length.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; throughout the code, one-tab and two-tab indentations are used: one tab is for the GHASH part, two tabs are for the AES part.
+;
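+; For the 12-byte (salt + sequence number) IV layout above, the initial counter
+; block is simply the IV followed by a big-endian 32-bit block counter that
+; starts at 1. A rough C model of that step (variable names hypothetical,
+; assuming a 12-byte IV):
+;
+;     uint8_t j0[16];
+;     memcpy(j0, iv, 12);                 /* salt + per-packet IV              */
+;     j0[12] = 0; j0[13] = 0;
+;     j0[14] = 0; j0[15] = 1;             /* 32-bit block counter, starts at 1 */
+;
+; GCM_INIT below stores this block as OrigIV and a byte-reflected copy as CurCount.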
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_avx_gen2.asm!"
+%endif
+%endif
+%endif
+
+%ifndef FUNCT_EXTENSION
+%define FUNCT_EXTENSION
+%endif
+
+%ifdef GCM128_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen2 %+ FUNCT_EXTENSION
+%define NROUNDS 9
+%endif
+
+%ifdef GCM192_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen2 %+ FUNCT_EXTENSION
+%define NROUNDS 11
+%endif
+
+%ifdef GCM256_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen2 %+ FUNCT_EXTENSION
+%define NROUNDS 13
+%endif
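+
+; For example, with GCM256_MODE defined and FUNCT_EXTENSION empty,
+; FN_NAME(enc,_update_) expands to aes_gcm_enc_256_update_avx_gen2; when a
+; *_nt wrapper defines FUNCT_EXTENSION as _nt (as the gcm256_*_nt.asm files
+; above do), _nt is appended to each of these symbol names.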
+
+default rel
+; need to push 5 registers onto the stack; STACK_OFFSET accounts for them when locating stack arguments
+%define STACK_OFFSET 8*5
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba
+ vpshufd %%T2, %%GH, 01001110b
+ vpshufd %%T3, %%HK, 01001110b
+ vpxor %%T2, %%T2, %%GH ; %%T2 = (a1+a0)
+ vpxor %%T3, %%T3, %%HK ; %%T3 = (b1+b0)
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
+ vpclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ vpxor %%T2, %%T2, %%GH
+ vpxor %%T2, %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
+
+ vpslldq %%T3, %%T2, 8 ; shift-L %%T3 2 DWs
+ vpsrldq %%T2, %%T2, 8 ; shift-R %%T2 2 DWs
+ vpxor %%GH, %%GH, %%T3
+ vpxor %%T1, %%T1, %%T2 ; <%%T1:%%GH> = %%GH x %%HK
+
+ ;first phase of the reduction
+        vpslld  %%T2, %%GH, 31                                 ; packed left shifting << 31
+        vpslld  %%T3, %%GH, 30                                 ; packed left shifting << 30
+        vpslld  %%T4, %%GH, 25                                 ; packed left shifting << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T5, %%T2, 4 ; shift-R %%T5 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+        vpsrld  %%T2,%%GH,1                                    ; packed right shifting >> 1
+        vpsrld  %%T3,%%GH,2                                    ; packed right shifting >> 2
+        vpsrld  %%T4,%%GH,7                                    ; packed right shifting >> 7
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpxor %%T2, %%T2, %%T5
+ vpxor %%GH, %%GH, %%T2
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+
+
+%endmacro
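+
+; The macro above computes this product with PCLMULQDQ and one Karatsuba step
+; instead of 128 conditional shift/xor iterations. Ignoring the bit-reflected
+; data layout used here, the underlying GF(2^128) GHASH multiply can be modelled
+; in C with the right-shift method of NIST SP 800-38D (function name hypothetical):
+;
+;     #include <stdint.h>
+;     #include <string.h>
+;
+;     static void ghash_mul_ref(uint8_t z[16], const uint8_t x[16], const uint8_t h[16])
+;     {
+;         uint8_t v[16], acc[16] = {0};
+;         memcpy(v, h, 16);
+;         for (int i = 0; i < 128; i++) {
+;             if ((x[i / 8] >> (7 - (i % 8))) & 1)        /* bit i of x, MSB first */
+;                 for (int j = 0; j < 16; j++) acc[j] ^= v[j];
+;             int lsb = v[15] & 1;                        /* bit that falls off    */
+;             for (int j = 15; j > 0; j--)                /* v >>= 1 (128-bit)     */
+;                 v[j] = (uint8_t)((v[j] >> 1) | (v[j - 1] << 7));
+;             v[0] >>= 1;
+;             if (lsb) v[0] ^= 0xe1;                      /* reduce mod the GCM poly */
+;         }
+;         memcpy(z, acc, 16);
+;     }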
+
+
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
+ vmovdqa %%T5, %%HK
+
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_3_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_5_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_8_k], %%T1
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ vpxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+        vpinsrq %%OUTPUT, [%%INPUT],0                  ;Read in 8 bytes if they exist
+ je %%_done
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ vpinsrq %%OUTPUT, %%TMP1, 1
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ vpinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
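+
+; Behaviourally this is a zero-padded partial load that never reads past the end
+; of the input buffer; a C sketch of the intended result (the assembly builds it
+; a byte at a time precisely to avoid over-reading) would be:
+;
+;     uint8_t block[16] = {0};
+;     memcpy(block, input, length);       /* length < 16, no over-read */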
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 5
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+
+ vmovdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
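+
+; In effect: hash = 0, then for every 16-byte AAD block B, hash = (hash ^ B) * H,
+; with a final short block zero-padded via READ_SMALL_DATA_INPUT. A rough C model
+; (reusing the hypothetical ghash_mul_ref sketched above):
+;
+;     uint8_t hash[16] = {0};
+;     while (aad_len >= 16) {
+;         for (int i = 0; i < 16; i++) hash[i] ^= aad[i];
+;         ghash_mul_ref(hash, hash, h);
+;         aad += 16; aad_len -= 16;
+;     }
+;     if (aad_len) { /* zero-pad the tail, then repeat the xor and multiply */ }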
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag update for partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input:
+; GDATA_KEY - struct gcm_key_data *
+; GDATA_CTX - struct gcm_context_data *
+; PLAIN_CYPH_IN - input text
+; PLAIN_CYPH_LEN - input text length
+; DATA_OFFSET - the current data offset
+; ENC_DEC - whether encoding or decoding
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 8
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%DATA_OFFSET %6
+%define %%AAD_HASH %7
+%define %%ENC_DEC %8
+ mov r13, [%%GDATA_CTX + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+        VXLDR   xmm1, [%%PLAIN_CYPH_IN]                 ;If at least 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read: ;Finished reading in data
+
+
+ vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ cmp r13, rax
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_1              ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_1:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpand xmm3, xmm1
+ vpshufb xmm3, [SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%%_dec_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_2              ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_2:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA_CTX+PBlockLen], %%PLAIN_CYPH_LEN
+%%_encode_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply GHASH on the ciphertext
+; %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as pointers only, not modified.
+; Updated AAD_HASH is returned in %%T3
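+; e.g. for a = 100 plaintext bytes: b = 6 full blocks, so %%num_initial_blocks = 6;
+; those 6 blocks are handled here, and the by-8 main loop then only ever sees a
+; multiple of 8 full blocks (the 4 trailing bytes go through the partial-block path).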
+
+%macro INITIAL_BLOCKS 24
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%LENGTH %5
+%define %%DATA_OFFSET %6
+%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %8
+%define %%HASH_KEY %9
+%define %%T3 %10
+%define %%T4 %11
+%define %%T5 %12
+%define %%CTR %13
+%define %%XMM1 %14
+%define %%XMM2 %15
+%define %%XMM3 %16
+%define %%XMM4 %17
+%define %%XMM5 %18
+%define %%XMM6 %19
+%define %%XMM7 %20
+%define %%XMM8 %21
+%define %%T6 %22
+%define %%T_key %23
+%define %%ENC_DEC %24
+
+%assign i (8-%%num_initial_blocks)
+ vmovdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg
+ ; start AES for %%num_initial_blocks blocks
+ vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS
+ vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep ; NROUNDS
+
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ vpxor reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ vmovdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; no need for precomputed constants
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; prepare and encrypt 8 counter blocks for the next 8 blocks of data
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM1, %%CTR
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM2, %%CTR
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM3, %%CTR
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM4, %%CTR
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM5, %%CTR
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM6, %%CTR
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM7, %%CTR
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM8, %%CTR
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+ vpxor %%XMM1, %%T_key
+ vpxor %%XMM2, %%T_key
+ vpxor %%XMM3, %%T_key
+ vpxor %%XMM4, %%T_key
+ vpxor %%XMM5, %%T_key
+ vpxor %%XMM6, %%T_key
+ vpxor %%XMM7, %%T_key
+ vpxor %%XMM8, %%T_key
+
+
+%assign i 1
+%rep NROUNDS
+ vmovdqu %%T_key, [%%GDATA_KEY+16*i]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*i]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA - (GCM key data), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; r11 is the data offset value
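+; Note: the hash lags the cipher by one iteration - at entry the previous 8
+; ciphertext blocks are saved (XMM1 in %%T2, XMM2-XMM8 in the TMP2-TMP8 stack
+; slots) and their GHASH (PCLMULQDQ/Karatsuba) work is interleaved with the AES
+; rounds of the next 8 counter blocks, so the two instruction streams overlap.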
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONE]
+ vpaddd %%XMM3, %%XMM2, [ONE]
+ vpaddd %%XMM4, %%XMM3, [ONE]
+ vpaddd %%XMM5, %%XMM4, [ONE]
+ vpaddd %%XMM6, %%XMM5, [ONE]
+ vpaddd %%XMM7, %%XMM6, [ONE]
+ vpaddd %%XMM8, %%XMM7, [ONE]
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONEf]
+ vpaddd %%XMM3, %%XMM2, [ONEf]
+ vpaddd %%XMM4, %%XMM3, [ONEf]
+ vpaddd %%XMM5, %%XMM4, [ONEf]
+ vpaddd %%XMM6, %%XMM5, [ONEf]
+ vpaddd %%XMM7, %%XMM6, [ONEf]
+ vpaddd %%XMM8, %%XMM7, [ONEf]
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%T1
+ vpxor %%XMM2, %%T1
+ vpxor %%XMM3, %%T1
+ vpxor %%XMM4, %%T1
+ vpxor %%XMM5, %%T1
+ vpxor %%XMM6, %%T1
+ vpxor %%XMM7, %%T1
+ vpxor %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+
+ vpshufd %%T6, %%T2, 01001110b
+ vpxor %%T6, %%T2
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%T6, %%T6, %%T5, 0x00 ;
+
+
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vpxor %%T6, %%T4
+ vpxor %%T6, %%T7
+
+%ifdef GCM128_MODE
+ vmovdqu %%T5, [%%GDATA + 16*10]
+%endif
+%ifdef GCM192_MODE
+ vmovdqu %%T5, [%%GDATA + 16*10]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12]
+%endif
+%ifdef GCM256_MODE
+ vmovdqu %%T5, [%%GDATA + 16*10]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*13]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*14]
+%endif
+
+%assign i 0
+%assign j 1
+%rep 8
+
+%ifidn %%ENC_DEC, ENC
+%ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
+ vpxor %%T2, %%T2, %%T5
+%else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
+%endif ; NT_LD
+ vaesenclast reg(j), reg(j), %%T2
+%else
+ VXLDR %%T2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*i]
+ vpxor %%T2, %%T2, %%T5
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*i], %%T3
+%endif ; %%ENC_DEC
+
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+ vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
+        vpsrldq %%T6, %%T6, 8                   ; shift-R %%T6 2 DWs
+ vpxor %%T7, %%T3
+ vpxor %%T6, %%T4 ; accumulate the results in %%T6:%%T7
+
+
+ ;first phase of the reduction
+
+        vpslld  %%T2, %%T7, 31                                  ; packed left shifting << 31
+        vpslld  %%T3, %%T7, 30                                  ; packed left shifting << 30
+        vpslld  %%T4, %%T7, 25                                  ; packed left shifting << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ %ifidn %%ENC_DEC, ENC
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer
+ %endif
+
+ ;second phase of the reduction
+
+        vpsrld  %%T2,%%T7,1                                     ; packed right shifting >> 1
+        vpsrld  %%T3,%%T7,2                                     ; packed right shifting >> 2
+        vpsrld  %%T4,%%T7,7                                     ; packed right shifting >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK]
+ vpshufb %%XMM3, [SHUF_MASK]
+ vpshufb %%XMM4, [SHUF_MASK]
+ vpshufb %%XMM5, [SHUF_MASK]
+ vpshufb %%XMM6, [SHUF_MASK]
+ vpshufb %%XMM7, [SHUF_MASK]
+ vpshufb %%XMM8, [SHUF_MASK]
+
+
+ vpxor %%XMM1, %%T6
+
+%endmacro
+
+
+; GHASH the last 8 ciphertext blocks.
+; %%GDATA is GCM key data
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+ ;; Karatsuba Method
+
+
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpxor %%T2, %%XMM1
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vmovdqu %%T3, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpxor %%T2, %%XMM2
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpxor %%T2, %%XMM3
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpxor %%T2, %%XMM4
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpxor %%T2, %%XMM5
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpxor %%T2, %%XMM6
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpxor %%T2, %%XMM7
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpxor %%T2, %%XMM8
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T4
+ vpxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;first phase of the reduction
+
+        vpslld  %%T2, %%T7, 31                                  ; packed left shifting << 31
+        vpslld  %%T3, %%T7, 30                                  ; packed left shifting << 30
+        vpslld  %%T4, %%T7, 25                                  ; packed left shifting << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+        vpsrld  %%T2,%%T7,1                                     ; packed right shifting >> 1
+        vpsrld  %%T3,%%T7,2                                     ; packed right shifting >> 2
+        vpsrld  %%T4,%%T7,7                                     ; packed right shifting >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+%endmacro
+
+
+; Encryption of a single block
+; %%GDATA is GCM key data
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxor %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep NROUNDS
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep ; NROUNDS
+ vaesenclast %%XMM0, [%%GDATA+16*i]
+%endmacro
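+
+; With NROUNDS = 9/11/13 this performs the initial key whitening, 9/11/13 vaesenc
+; rounds and a final vaesenclast, i.e. the usual 10/12/14 AES rounds for
+; 128/192/256-bit keys.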
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+ ;; Required for Update/GMC_ENC
+        ;the number of pushes times 8 must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ push rsi
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+
+ mov arg5, arg(5) ;[r14 + STACK_OFFSET + 8*5]
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GMC_ENC
+ mov rsp, r14
+ pop rsi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+; Input: struct gcm_key_data *(GDATA_KEY), struct gcm_context_data *(GDATA_CTX),
+; IV, Additional Authentication Data (A_IN), Additional
+; Authentication Data length (A_LEN)
+; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and the other parts of GDATA_CTX initialized.
+; Clobbers rax, r10-r13, and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%IV %3
+%define %%A_IN %4
+%define %%A_LEN %5
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ vmovdqu %%SUBHASH, [%%GDATA_KEY + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ vpxor xmm2, xmm3
+ mov r10, %%A_LEN
+
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
+ mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0
+ mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001
+ vpinsrq xmm2, [r10], 0
+ vpinsrd xmm2, [r10+8], 2
+ vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
+
+ vpshufb xmm2, [SHUF_MASK]
+
+ vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
+%endmacro
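+
+; In C terms (field names as used in the comments above, 12-byte IV case),
+; GCM_INIT leaves the context as:
+;
+;     ctx->aad_hash             = GHASH(aad);          /* CALC_AAD_HASH */
+;     ctx->aad_length           = aad_len;
+;     ctx->in_length            = 0;
+;     ctx->partial_block_length = 0;
+;     ctx->orig_IV              = iv || 0x00000001;
+;     ctx->current_counter      = byte-reflected copy of ctx->orig_IV;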
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
+; has been initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data * (GDATA_CTX),
+; input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16-byte blocks in the message
+; process (number of 16-byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16-byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes, process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA_CTX+InLen], %%PLAIN_CYPH_LEN ; Update length of data processed
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey
+ vmovdqu xmm8, [%%GDATA_CTX + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+
+ mov r13, %%PLAIN_CYPH_LEN
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ; save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ vpshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ vpshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; ctx_data.aad hash = xmm14
+ vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; ctx_data.current_counter = xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+        mov     [%%GDATA_CTX + PBlockLen], r13          ; ctx_data.partial_block_length = r13
+        ; handle the last <16 Byte block separately
+
+ vpaddd xmm9, [ONE] ; INCR CNT to get Yn
+ vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Yn)
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm2, xmm1
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpand xmm2, xmm1
+ vpshufb xmm2, [SHUF_MASK]
+ vpxor xmm14, xmm2
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ vpxor xmm14, xmm9
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of the last partial block after GCM_UPDATE finishes.
+; Input: struct gcm_key_data* (GDATA_KEY), struct gcm_context_data *(GDATA_CTX) and
+; whether encoding or decoding (ENC_DEC).
+; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%PLAIN_CYPH_LEN rax
+
+ mov r12, [%%GDATA_CTX + PBlockLen]
+ vmovdqu xmm14, [%%GDATA_CTX + AadHash]
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovd xmm15, r12d ; len(A) in xmm15
+
+        shl     %%PLAIN_CYPH_LEN, 3                     ; len(C) in bits (*8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
+
+ vpxor xmm9, xmm14
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ; GCM_COMPLETE
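+
+; Note (summary of the tag math above, per the GCM definition in NIST SP 800-38D):
+;   S = GHASH_H( A || pad || C || pad || [len(A)]_64 || [len(C)]_64 )
+;   T = MSB_t( E(K, Y0) XOR S )
+; GCM_COMPLETE folds the length block into the running GHASH (xmm14), encrypts the
+; saved initial counter block Y0 (OrigIV), XORs the two, and truncates the result
+; to the requested 8, 12 or 16 byte tag.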
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_avx_gen2
+; (struct gcm_key_data *key_data);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(precomp,_)
+FN_NAME(precomp,_):
+ endbranch
+
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, 1
+ vpsrlq xmm2, 63
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8
+ vpsrldq xmm1, xmm1, 8
+ vpor xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [TWOONE]
+ vpand xmm2, [POLY]
+ vpxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+%endif ; _nt
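+
+; Note: the precomp routine above derives the GHASH key as H = E(K, 0^128)
+; (an encryption of the all-zero block), stores HashKey<<1 mod poly, and then
+; has PRECOMPUTE fill the table of higher powers of H used by the multi-block
+; GHASH loops.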
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(init,_)
+FN_NAME(init,_):
+ endbranch
+
+ push r12
+ push r13
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ push arg5
+ sub rsp, 1*16
+ vmovdqu [rsp + 0*16],xmm6
+ mov arg5, [rsp + 1*16 + 8*3 + 8*5]
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4, arg5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+ pop arg5
+%endif
+ pop r13
+ pop r12
+ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_update_)
+FN_NAME(enc,_update_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_update_)
+FN_NAME(dec,_update_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(enc,_finalize_)
+FN_NAME(enc,_finalize_):
+ endbranch
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(dec,_finalize_)
+FN_NAME(dec,_finalize_):
+ endbranch
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_)
+FN_NAME(enc,_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_avx_gen2(
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_)
+FN_NAME(dec,_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC
+
+ FUNC_RESTORE
+
+ ret
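+
+; Example call sequence for the functions above (a minimal sketch in C, assuming
+; these gen2 symbols are declared to the caller and that the AES round keys have
+; already been expanded into key_data - key expansion is not shown here):
+;
+;   struct gcm_key_data key_data;         /* round keys + hash key table  */
+;   struct gcm_context_data ctx;          /* per-message state            */
+;
+;   aes_gcm_precomp_128_avx_gen2(&key_data);                        /* derive H and its powers  */
+;   aes_gcm_init_128_avx_gen2(&key_data, &ctx, iv, aad, aad_len);   /* per-message setup        */
+;   aes_gcm_enc_128_update_avx_gen2(&key_data, &ctx, out, in, len); /* may be called repeatedly */
+;   aes_gcm_enc_128_finalize_avx_gen2(&key_data, &ctx, tag, 16);    /* tag length: 8, 12 or 16  */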
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm
new file mode 100644
index 000000000..4a0b4f82e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_avx_gen4.asm
@@ -0,0 +1,3277 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in the paper:
+; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+; The details of the implementation are explained in:
+; Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16-byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; From the definition of the spec, aadLen must be a multiple of 4 bytes.
+; The code additionally supports any aadLen length.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; Throughout the code, one-tab and two-tab indentations are used: one tab is for the GHASH part, two tabs are for the AES part.
+;
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_avx_gen4.asm!"
+%endif
+%endif
+%endif
+
+%ifndef FUNCT_EXTENSION
+%define FUNCT_EXTENSION
+%endif
+
+;; Decide on AES-GCM key size to compile for
+%ifdef GCM128_MODE
+%define NROUNDS 9
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ avx_gen4 %+ FUNCT_EXTENSION
+%endif
+
+%ifdef GCM192_MODE
+%define NROUNDS 11
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ avx_gen4 %+ FUNCT_EXTENSION
+%endif
+
+%ifdef GCM256_MODE
+%define NROUNDS 13
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ avx_gen4 %+ FUNCT_EXTENSION
+%endif
+
+section .text
+default rel
+
+; need to push 5 registers into stack to maintain
+%define STACK_OFFSET 8*5
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1
+ vpxor %%GH, %%GH, %%T3
+
+
+ vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs
+ vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs
+
+ vpxor %%T1, %%T1, %%T3
+ vpxor %%GH, %%GH, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%GH, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%GH, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%GH, %%T3, %%GH, 0x10
+ vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%GH, %%GH, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+
+%endmacro
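+
+; Note: the 256-bit carry-less product is reduced modulo
+;   g(x) = x^128 + x^127 + x^126 + x^121 + 1
+; in two phases using the POLY2 constant; because both operands are kept
+; bit-reflected and the key is pre-shifted (HashKey<<1 mod poly), the result is
+; again the bit-reflected GH*HashKey mod g(x), as stated in the header above.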
+
+
+; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx_gen4
+; functions, but are kept to allow users to switch cpu architectures between calls
+; of pre, init, update, and finalize.
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+        ; Hashkey_i_k holds XORed values of the low and high parts of the Hashkey_i
+ vmovdqa %%T5, %%HK
+
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_3_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_5_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_8_k], %%T1
+%endmacro
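+
+; After PRECOMPUTE, the key structure holds HashKey_i = HashKey^i<<1 mod poly for
+; i = 1..8 (consumed by the 8-block parallel GHASH), plus the HashKey_i_k values
+; (the XOR of the high and low halves of each HashKey_i), which are kept only for
+; compatibility with the other code paths, as noted above.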
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ vpxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+        vpinsrq %%OUTPUT, [%%INPUT],0           ;Read in 8 bytes if they exist
+ je %%_done
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ vpinsrq %%OUTPUT, %%TMP1, 1
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+        ;; NOTE: in the current implementation the check for zero length is not needed here;
+        ;; the adequate checks are done by the callers of this macro.
+ ;; cmp %%COUNTER, 0
+ ;; je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ vpinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
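+
+; Worked example: for LENGTH = 11 the macro first loads the low 8 bytes with
+; vpinsrq, then walks the remaining 3 bytes backwards from INPUT+11 into a
+; temporary register and inserts them as the upper quadword, leaving the unread
+; bytes of that lane zeroed.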
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 5
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+
+ vmovdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
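+
+; Each 16-byte AAD block is byte-reflected and folded into the running hash as
+;   AAD_HASH = (AAD_HASH XOR block) * HashKey  mod poly
+; A trailing block shorter than 16 bytes is read with READ_SMALL_DATA_INPUT,
+; which zero-pads it before the same fold, matching the zero-padding described
+; in the AAD format comments at the top of this file.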
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag computation for partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 8
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%DATA_OFFSET %6
+%define %%AAD_HASH %7
+%define %%ENC_DEC %8
+
+ mov r13, [%%GDATA_CTX + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+        VXLDR   xmm1, [%%PLAIN_CYPH_IN]                 ;If at least 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read: ;Finished reading in data
+
+
+ vmovdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_1              ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_1:
+
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK]; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpand xmm3, xmm1
+ vpshufb xmm3, [SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%%_dec_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_2              ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_2:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%%_encode_done:
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+%macro GHASH_SINGLE_MUL 9
+%define %%GDATA %1
+%define %%HASHKEY %2
+%define %%CIPHER %3
+%define %%STATE_11 %4
+%define %%STATE_00 %5
+%define %%STATE_MID %6
+%define %%T1 %7
+%define %%T2 %8
+%define %%FIRST %9
+
+ vmovdqu %%T1, [%%GDATA + %%HASHKEY]
+%ifidn %%FIRST, first
+ vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0
+ vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+%else
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11
+ vpxor %%STATE_11, %%STATE_11, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00
+ vpxor %%STATE_00, %%STATE_00, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+%endif
+
+%endmacro
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as pointers only, not modified.
+; Updated AAD_HASH is returned in %%T3
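+; Worked example: for a = 200 plaintext bytes, b = floor(200/16) = 12 and
+; %%num_initial_blocks = 12 mod 8 = 4, so 4 blocks are handled here and the
+; remaining full blocks go through the 8-block parallel loop.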
+
+%macro INITIAL_BLOCKS 23
+%define %%GDATA_KEY %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%T2 %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+
+%assign i (8-%%num_initial_blocks)
+ ;; Move AAD_HASH to temp reg
+ vmovdqu %%T2, %%XMM8
+ ;; Start AES for %%num_initial_blocks blocks
+ ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+%if(%%num_initial_blocks>0)
+vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%endif ; %if(%%num_initial_blocks>0)
+
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ ;; Write back ciphertext for %%num_initial_blocks blocks
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ ;; Prepare ciphertext for GHASH computations
+ vpshufb reg(i), [SHUF_MASK]
+%assign i (i+1)
+%endrep
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%assign i (9-%%num_initial_blocks)
+%if(%%num_initial_blocks>0)
+ vmovdqa %%T3, reg(i)
+%assign i (i+1)
+
+%rep %%num_initial_blocks-1
+ vmovdqu [rsp + TMP %+ i], reg(i)
+%assign i (i+1)
+%endrep
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+                ;; Hashkey_i_k holds XORed values of the low and high parts of
+                ;; the Hashkey_i
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR Y0
+ vpaddd %%XMM2, %%CTR, [TWO] ; INCR Y0
+ vpaddd %%XMM3, %%XMM1, [TWO] ; INCR Y0
+ vpaddd %%XMM4, %%XMM2, [TWO] ; INCR Y0
+ vpaddd %%XMM5, %%XMM3, [TWO] ; INCR Y0
+ vpaddd %%XMM6, %%XMM4, [TWO] ; INCR Y0
+ vpaddd %%XMM7, %%XMM5, [TWO] ; INCR Y0
+ vpaddd %%XMM8, %%XMM6, [TWO] ; INCR Y0
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+ vpxor %%XMM1, %%XMM1, %%T_key
+ vpxor %%XMM2, %%XMM2, %%T_key
+ vpxor %%XMM3, %%XMM3, %%T_key
+ vpxor %%XMM4, %%XMM4, %%T_key
+ vpxor %%XMM5, %%XMM5, %%T_key
+ vpxor %%XMM6, %%XMM6, %%T_key
+ vpxor %%XMM7, %%XMM7, %%T_key
+ vpxor %%XMM8, %%XMM8, %%T_key
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks)
+
+%define %%T4_2 %%T4
+%if(%%num_initial_blocks>0)
+ ;; Hash in AES state
+ ;; T2 - incoming AAD hash
+ vpxor %%T2, %%T3
+
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*1]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*2]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>1)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*3]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*4]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>2)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>3)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*5]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*6]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>4)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*7]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*8]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>5)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*9]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+%ifndef GCM128_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*10]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>6)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+%ifdef GCM128_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*10]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+%endif
+
+%ifdef GCM192_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*11]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*12]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+%endif
+%ifdef GCM256_MODE
+ vmovdqu %%T_key, [%%GDATA_KEY+16*11]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*12]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%if(%%num_initial_blocks>7)
+ ;; GDATA, HASHKEY, CIPHER,
+ ;; STATE_11, STATE_00, STATE_MID, T1, T2
+ vmovdqu %%T2, [rsp + TMP %+ j]
+ GHASH_SINGLE_MUL %%GDATA_KEY, HashKey_ %+ k, %%T2, \
+ %%T1, %%T4, %%T6, %%T5, %%T3, not_first
+%endif
+
+%ifdef GCM256_MODE ; GCM256
+ vmovdqu %%T_key, [%%GDATA_KEY+16*13]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+
+ vmovdqu %%T_key, [%%GDATA_KEY+16*14]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+%endif ; GCM256 mode
+
+%if(%%num_initial_blocks>0)
+ vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs
+ vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs
+ vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
+ vpxor %%T4, %%T6, %%T4
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; First phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T4, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ ;; First phase of the reduction complete
+ vpxor %%T4, %%T4, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T4, 0x00
+ ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+ vpsrldq %%T2, %%T2, 4
+
+ vpclmulqdq %%T4, %%T3, %%T4, 0x10
+ ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+ vpslldq %%T4, %%T4, 4
+ ;; Second phase of the reduction complete
+ vpxor %%T4, %%T4, %%T2
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; The result is in %%T3
+ vpxor %%T3, %%T1, %%T4
+%else
+ ;; The hash should end up in T3
+ vmovdqa %%T3, %%T2
+%endif
+
+ ;; Final hash is now in T3
+%if %%num_initial_blocks > 0
+ ;; NOTE: obsolete in case %%num_initial_blocks = 0
+ sub %%LENGTH, 16*%%num_initial_blocks
+%endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+%if %%num_initial_blocks > 0
+ ;; NOTE: 'jl' is never taken for %%num_initial_blocks = 0
+        ;; This macro is executed for length 128 and up,
+ ;; zero length is checked in GCM_ENC_DEC.
+ ;; If the last block is partial then the xor will be done later
+ ;; in ENCRYPT_FINAL_PARTIAL_BLOCK.
+ ;; We know it's partial if LENGTH - 16*num_initial_blocks < 128
+ cmp %%LENGTH, 128
+ jl %%_initial_skip_last_word_write
+%endif
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ ;; Update %%LENGTH with the number of blocks processed
+ sub %%LENGTH, 16
+ add %%DATA_OFFSET, 16
+%%_initial_skip_last_word_write:
+ sub %%LENGTH, 128-16
+ add %%DATA_OFFSET, 128-16
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ ;; Combine GHASHed value with the corresponding ciphertext
+ vpxor %%XMM1, %%XMM1, %%T3
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+;;; INITIAL_BLOCKS macro with support for a partial final block.
+;;; num_initial_blocks is expected to include the partial final block
+;;; in the count.
+%macro INITIAL_BLOCKS_PARTIAL 25
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%LENGTH %5
+%define %%DATA_OFFSET %6
+%define %%num_initial_blocks %7 ; can be 1, 2, 3, 4, 5, 6 or 7 (not 0)
+%define %%T1 %8
+%define %%T2 %9
+%define %%T3 %10
+%define %%T4 %11
+%define %%T5 %12
+%define %%CTR %13
+%define %%XMM1 %14
+%define %%XMM2 %15
+%define %%XMM3 %16
+%define %%XMM4 %17
+%define %%XMM5 %18
+%define %%XMM6 %19
+%define %%XMM7 %20
+%define %%XMM8 %21
+%define %%T6 %22
+%define %%T_key %23
+%define %%ENC_DEC %24
+%define %%INSTANCE_TYPE %25
+
+%assign i (8-%%num_initial_blocks)
+ ;; Move AAD_HASH to temp reg
+ vmovdqu %%T2, %%XMM8
+ ;; vmovdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ ;; Compute AES counters
+ vpaddd %%CTR, %%CTR, [rel ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [rel SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ ; Start AES for %%num_initial_blocks blocks
+ vpxor reg(i),reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Hash all but the last block of data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%assign i (9-%%num_initial_blocks)
+%if %%num_initial_blocks > 0
+%rep %%num_initial_blocks-1
+ ;; Encrypt the message for all but the last block
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ ;; write back ciphertext for %%num_initial_blocks blocks
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ ;; Prepare ciphertext for GHASH computations
+ vpshufb reg(i), [rel SHUF_MASK]
+%assign i (i+1)
+%endrep
+%endif
+ ;; The final block of data may be <16B
+ sub %%LENGTH, 16*(%%num_initial_blocks-1)
+
+%if %%num_initial_blocks < 8
+ ;; NOTE: the 'jl' is always taken for num_initial_blocks = 8.
+ ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 128.
+ cmp %%LENGTH, 16
+ jl %%_small_initial_partial_block
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Handle a full length final block - encrypt and hash all blocks
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ sub %%LENGTH, 16
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+
+ ;; Encrypt the message
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ ;; write back ciphertext for %%num_initial_blocks blocks
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i)
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ ;; Prepare ciphertext for GHASH computations
+ vpshufb reg(i), [rel SHUF_MASK]
+
+ ;; Hash all of the data
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks)
+%assign last_block_to_hash 0
+
+%if(%%num_initial_blocks>last_block_to_hash)
+ ;; Hash in AES state
+ vpxor %%T2, reg(j)
+
+ ;; T2 - incoming AAD hash
+ ;; reg(i) holds ciphertext
+ ;; T5 - hash key
+ ;; T6 - updated xor
+ ;; reg(1)/xmm1 should now be available for tmp use
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%assign rep_count (%%num_initial_blocks-1)
+%if rep_count > 0
+%rep rep_count
+
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T3, reg(j), %%T5, 0x11
+ vpxor %%T1, %%T1, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x00
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%endrep
+%endif
+ ;; Record that a reduction is needed
+ mov r12, 1
+
+ jmp %%_small_initial_compute_hash
+
+
+%endif ; %if %%num_initial_blocks < 8
+
+%%_small_initial_partial_block:
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Handle ghash for a <16B final block
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; In this case if it's a single call to encrypt we can
+ ;; hash all of the data but if it's an init / update / finalize
+        ;; series of calls we need to leave the last block if it's
+ ;; less than a full block of data.
+
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], reg(i)
+ ;; Handle a partial final block
+ ;; GDATA, KEY, T1, T2
+ ;; r13 - length
+ ;; LT16 - indicates type of read and that the buffer is less than 16 bytes long
+ ;; NOTE: could be replaced with %%LENGTH but at this point
+ ;; %%LENGTH is always less than 16.
+ ;; No PLAIN_CYPH_LEN argument available in this macro.
+ ENCRYPT_FINAL_PARTIAL_BLOCK reg(i), %%T1, %%T3, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, LT16, %%ENC_DEC, %%DATA_OFFSET
+ vpshufb reg(i), [SHUF_MASK]
+
+%ifidn %%INSTANCE_TYPE, multi_call
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks-1)
+%assign last_block_to_hash 1
+%else
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+%assign k (%%num_initial_blocks)
+%assign last_block_to_hash 0
+%endif
+
+%if(%%num_initial_blocks>last_block_to_hash)
+ ;; Record that a reduction is needed
+ mov r12, 1
+ ;; Hash in AES state
+ vpxor %%T2, reg(j)
+
+ ;; T2 - incoming AAD hash
+ ;; reg(i) holds ciphertext
+ ;; T5 - hash key
+ ;; T6 - updated xor
+ ;; reg(1)/xmm1 should now be available for tmp use
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T1, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T4, %%T2, %%T5, 0x00 ; %%T4 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+%else
+ ;; Record that a reduction is not needed -
+ ;; In this case no hashes are computed because there
+ ;; is only one initial block and it is < 16B in length.
+ mov r12, 0
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%ifidn %%INSTANCE_TYPE, multi_call
+%assign rep_count (%%num_initial_blocks-2)
+%%_multi_call_hash:
+%else
+%assign rep_count (%%num_initial_blocks-1)
+%endif
+%if rep_count > 0
+%rep rep_count
+
+ vmovdqu %%T5, [%%GDATA_KEY + HashKey_ %+ k]
+ vpclmulqdq %%T3, reg(j), %%T5, 0x11
+ vpxor %%T1, %%T1, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x00
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, reg(j), %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+%assign i (i+1)
+%assign j (j+1)
+%assign k (k-1)
+%endrep
+%endif
+
+%%_small_initial_compute_hash:
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Ghash reduction
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%if(%%num_initial_blocks=1)
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; We only need to check if a reduction is needed if
+ ;; initial_blocks == 1 and init/update/final is being used.
+ ;; In this case we may just have a partial block, and that
+ ;; gets hashed in finalize.
+ cmp r12, 0
+ je %%_no_reduction_needed
+%endif
+%endif
+
+ vpsrldq %%T3, %%T6, 8 ; shift-R %%T2 2 DWs
+ vpslldq %%T6, %%T6, 8 ; shift-L %%T3 2 DWs
+ vpxor %%T1, %%T1, %%T3 ; accumulate the results in %%T1:%%T4
+ vpxor %%T4, %%T6, %%T4
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; First phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T4, 0x01
+ ;; shift-L xmm2 2 DWs
+ vpslldq %%T2, %%T2, 8
+ vpxor %%T4, %%T4, %%T2
+
+ ;; First phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Second phase of the reduction
+
+ vpclmulqdq %%T2, %%T3, %%T4, 0x00
+ ;; Shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+ vpsrldq %%T2, %%T2, 4
+
+ vpclmulqdq %%T4, %%T3, %%T4, 0x10
+ ;; Shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+ vpslldq %%T4, %%T4, 4
+
+ vpxor %%T4, %%T4, %%T2
+ ;; Second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T3, %%T1, %%T4
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; If using init/update/finalize, we need to xor any partial block data
+ ;; into the hash.
+%if %%num_initial_blocks > 1
+ ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place
+%if %%num_initial_blocks != 8
+ ;; NOTE: for %%num_initial_blocks = 8, %%LENGTH, stored in [PBlockLen] is never zero
+ cmp qword [%%GDATA_CTX + PBlockLen], 0
+ je %%_no_partial_block_xor
+%endif ; %%num_initial_blocks != 8
+ vpxor %%T3, %%T3, reg(8)
+%%_no_partial_block_xor:
+%endif ; %%num_initial_blocks > 1
+%endif ; %%INSTANCE_TYPE, multi_call
+
+%if(%%num_initial_blocks=1)
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: %%_no_reduction_needed case only valid for
+ ;; multi_call with initial_blocks = 1.
+ ;; Look for comment above around '_no_reduction_needed'
+ ;; The jmp below is obsolete as the code will fall through.
+
+ ;; The result is in %%T3
+ jmp %%_after_reduction
+
+%%_no_reduction_needed:
+ ;; The hash should end up in T3. The only way we should get here is if
+ ;; there is a partial block of data, so xor that into the hash.
+ vpxor %%T3, %%T2, reg(8)
+%endif ; %%INSTANCE_TYPE = multi_call
+%endif ; %%num_initial_blocks=1
+
+%%_after_reduction:
+ ;; Final hash is now in T3
+
+%endmacro ; INITIAL_BLOCKS_PARTIAL
+
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value
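+; The AES rounds for the current 8 counter blocks are interleaved with the GHASH
+; (pclmulqdq) work on the previous 8 ciphertext blocks so that the two dependency
+; chains overlap, which is the scheduling approach described in the referenced papers.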
+%macro GHASH_8_ENCRYPT_8_PARALLEL 23
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+%define %%FULL_PARTIAL %23
+
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
+ vmovdqu %%T5, [TWO]
+ vpaddd %%XMM2, %%CTR, %%T5
+ vpaddd %%XMM3, %%XMM1, %%T5
+ vpaddd %%XMM4, %%XMM2, %%T5
+ vpaddd %%XMM5, %%XMM3, %%T5
+ vpaddd %%XMM6, %%XMM4, %%T5
+ vpaddd %%XMM7, %%XMM5, %%T5
+ vpaddd %%XMM8, %%XMM6, %%T5
+ vmovdqa %%CTR, %%XMM8
+
+ vmovdqu %%T5, [SHUF_MASK]
+ vpshufb %%XMM1, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM2, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM3, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM4, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM5, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM6, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM7, %%T5 ; perform a 16Byte swap
+ vpshufb %%XMM8, %%T5 ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
+ vmovdqu %%T5, [TWOf]
+ vpaddd %%XMM2, %%CTR, %%T5
+ vpaddd %%XMM3, %%XMM1, %%T5
+ vpaddd %%XMM4, %%XMM2, %%T5
+ vpaddd %%XMM5, %%XMM3, %%T5
+ vpaddd %%XMM6, %%XMM4, %%T5
+ vpaddd %%XMM7, %%XMM5, %%T5
+ vpaddd %%XMM8, %%XMM6, %%T5
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ vpxor %%XMM2, %%XMM2, %%T1
+ vpxor %%XMM3, %%XMM3, %%T1
+ vpxor %%XMM4, %%XMM4, %%T1
+ vpxor %%XMM5, %%XMM5, %%T1
+ vpxor %%XMM6, %%XMM6, %%T1
+ vpxor %%XMM7, %%XMM7, %%T1
+ vpxor %%XMM8, %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T1, %%T4, %%T3
+
+
+ vmovdqu %%T5, [%%GDATA + 16*10]
+ %ifndef GCM128_MODE ; GCM192 or GCM256
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12]
+%endif
+%ifdef GCM256_MODE
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*13]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*14]
+%endif ; GCM256
+
+%assign i 0
+%assign j 1
+%rep 8
+
+ ;; SNP TBD: This is pretty ugly - consider whether just XORing the
+ ;; data in after vaesenclast would be simpler and as performant. The change
+ ;; would also have to be rippled through the partial block and ghash_mul_8 paths.
+%ifidn %%FULL_PARTIAL, full
+ %ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %endif
+
+ %ifidn %%ENC_DEC, ENC
+ vaesenclast reg(j), reg(j), %%T2
+ %else
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+ %endif
+
+%else
+ ; Don't read the final data during partial block processing
+ %ifdef NT_LD
+ %if (i<7)
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ ;; Stage the key directly in T2 rather than hash it with plaintext
+ vmovdqu %%T2, %%T5
+ %endif
+ %else
+ %if (i<7)
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %else
+ ;; Stage the key directly in T2 rather than hash it with plaintext
+ vmovdqu %%T2, %%T5
+ %endif
+ %endif
+
+ %ifidn %%ENC_DEC, ENC
+ vaesenclast reg(j), reg(j), %%T2
+ %else
+ %if (i<7)
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ ;; Do not read the data since it could fault
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+ %else
+ vaesenclast reg(j), reg(j), %%T2
+ %endif
+ %endif
+%endif
+
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
+ vpsrldq %%T6, %%T6, 8 ; shift-R %%T2 2 DWs
+ vpxor %%T7, %%T7, %%T3
+ vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ %ifidn %%ENC_DEC, ENC
+ ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7
+ %ifidn %%FULL_PARTIAL, full
+ ;; Avoid writing past the buffer if handling a partial block
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8
+ %endif
+ %endif
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T1, %%T1, %%T4 ; the result is in %%T1
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+
+ vpxor %%XMM1, %%T1
+
+
+%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL
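+
+;; For reference: the "first phase"/"second phase" reduction above (and in
+;; GHASH_LAST_8 / GHASH_LAST_7 below) folds the 256-bit carry-less product of
+;; the ciphertext blocks and the HashKey powers back into 128 bits, i.e. it
+;; computes <high:low> mod g(x) with g(x) = x^128 + x^7 + x^2 + x + 1, using
+;; the precomputed POLY2 constant so the whole reduction takes only three
+;; vpclmulqdq instructions. This is a descriptive note only; see the
+;; instruction sequence above for the exact register usage.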
+
+
+ ; GHASH the last 8 ciphertext blocks.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+
+ ;; Karatsuba Method
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM1
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM2
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM3
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM4
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM5
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM6
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM7
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM8
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T7, %%T4
+ vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
+%endmacro
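+
+;; Note on the "Karatsuba Method" used above: each 128x128-bit carry-less
+;; multiply of a ciphertext block X by a HashKey power H is done with three
+;; vpclmulqdq operations instead of four:
+;;     hi  = X.hi * H.hi
+;;     lo  = X.lo * H.lo
+;;     mid = (X.hi ^ X.lo) * (H.hi ^ H.lo)
+;; and the middle 128 bits are recovered as mid ^ hi ^ lo before the
+;; accumulated result is reduced. (Descriptive sketch only; the macro keeps
+;; the hi terms in %%T6, the lo terms in %%T7 and the mid terms in %%XMM1.)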
+
+
+ ; GHASH the last 7 ciphertext blocks.
+%macro GHASH_LAST_7 15
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+
+ ;; Karatsuba Method
+
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM1
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM2
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM3
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM4
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM5
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM6
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_1]
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM7
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T7, %%T4
+ vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
+%endmacro
+
+
+
+;;; Handle encryption of the final partial block
+;;; IN:
+;;; r13 - Number of bytes to read
+;;; MODIFIES:
+;;; KEY - Key for encrypting the partial block
+;;; HASH - Current hash value
+;;; SMASHES:
+;;; r10, r12, r15, rax
+;;; T1, T2
+;;; Note:
+;;; PLAIN_CYPH_LEN, %6, is passed only to determine
+;;; if buffer is big enough to do a 16 byte read & shift.
+;;; 'LT16' is passed here only if buffer is known to be smaller
+;;; than 16 bytes.
+;;; Any other value passed here will result in 16 byte read
+;;; code path.
+;;; TBD: Remove HASH from the instantiation
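+;;; Rough outline (descriptive only): for buffers known to be shorter than
+;;; 16 bytes the block is gathered via READ_SMALL_DATA_INPUT; otherwise the
+;;; last 16 bytes of the message are re-read and shifted right by 16-r13
+;;; bytes using SHIFT_MASK. The encrypted counter in KEY is then XORed with
+;;; the data, the top 16-r13 bytes are masked off, and the r13 result bytes
+;;; are written out.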
+%macro ENCRYPT_FINAL_PARTIAL_BLOCK 8
+%define %%KEY %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%CYPH_PLAIN_OUT %4
+%define %%PLAIN_CYPH_IN %5
+%define %%PLAIN_CYPH_LEN %6
+%define %%ENC_DEC %7
+%define %%DATA_OFFSET %8
+
+ ;; NOTE: the type of read is tuned based on the %%PLAIN_CYPH_LEN setting
+%ifidn %%PLAIN_CYPH_LEN, LT16
+ ;; Handle the case where the message is < 16 bytes
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+
+ ;; T1 - packed output
+ ;; r10 - input data address
+ ;; r13 - input data length
+ ;; r12, r15, rax - temp registers
+ READ_SMALL_DATA_INPUT %%T1, r10, r13, r12, r15, rax
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+%else
+ ;; Handle the case where the message is >= 16 bytes
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+ ;; Receive the last <16 Byte block
+ vmovdqu %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET]
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+ lea r12, [SHIFT_MASK + 16]
+ ;; Adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+ ;; (r13 is the number of bytes in plaintext mod 16)
+ sub r12, r13
+ ;; Get the appropriate shuffle mask
+ vmovdqu %%T2, [r12]
+ ;; shift right 16-r13 bytes
+ vpshufb %%T1, %%T2
+%endif ; %%PLAIN_CYPH_LEN, LT16
+
+ ;; At this point T1 contains the partial block data
+%ifidn %%ENC_DEC, DEC
+ ;; Plaintext XOR E(K, Yn)
+ ;; Set aside the ciphertext
+ vmovdqa %%T2, %%T1
+ vpxor %%KEY, %%KEY, %%T1
+ ;; Get the appropriate mask to mask out top 16-r13 bytes of ciphertext
+ vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK]
+ ;; Mask out top 16-r13 bytes of ciphertext
+ vpand %%KEY, %%KEY, %%T1
+
+ ;; Prepare the ciphertext for the hash
+ ;; mask out top 16-r13 bytes of the plaintext
+ vpand %%T2, %%T2, %%T1
+%else
+ ;; Plaintext XOR E(K, Yn)
+ vpxor %%KEY, %%KEY, %%T1
+ ;; Get the appropriate mask to mask out top 16-r13 bytes of %%KEY
+ vmovdqu %%T1, [r12 + ALL_F - SHIFT_MASK]
+ ;; Mask out top 16-r13 bytes of %%KEY
+ vpand %%KEY, %%KEY, %%T1
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Output r13 Bytes
+ vmovq rax, %%KEY
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq %%T1, %%KEY, 8
+ vmovq rax, %%T1
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn %%ENC_DEC, DEC
+ ;; If decrypt, restore the ciphertext into %%KEY
+ vmovdqu %%KEY, %%T2
+%endif
+%endmacro ; ENCRYPT_FINAL_PARTIAL_BLOCK
+
+
+
+; Encryption of a single block
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxor %%XMM0, %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep NROUNDS
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+ vaesenclast %%XMM0, [%%GDATA+16*i]
+%endmacro
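+
+;; Note: NROUNDS above counts only the middle vaesenc rounds; the first round
+;; key is applied with the initial vpxor and the last with vaesenclast, so
+;; NROUNDS is expected to be 9, 11 or 13 for AES-128/192/256 (10/12/14 rounds
+;; in total). NROUNDS itself is defined elsewhere, per key size.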
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+ ;; Required for Update/GCM_ENC
+ ; the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ push rsi
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+
+ mov arg5, arg(5) ;[r14 + STACK_OFFSET + 8*5]
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14, [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13, [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12, [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11, [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10, [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9, [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8, [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7, [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14
+ pop rsi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
+; Additional Authentication data (A_IN), Additional Data length (A_LEN).
+; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and the other parts of GDATA_CTX initialized.
+; Clobbers rax, r10-r13, and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%IV %3
+%define %%A_IN %4
+%define %%A_LEN %5
+%define %%AAD_HASH xmm14
+%define %%SUBHASH xmm1
+
+
+ vmovdqu %%SUBHASH, [%%GDATA_KEY + HashKey]
+
+ mov r10, %%A_LEN
+ cmp r10, 0
+ je %%_aad_is_zero
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ jmp %%_after_aad
+
+%%_aad_is_zero:
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+%%_after_aad:
+ mov r10, %%A_LEN
+ vpxor xmm2, xmm3
+
+ vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
+ mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0
+ mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001
+ vpinsrq xmm2, [r10], 0
+ vpinsrd xmm2, [r10+8], 2
+ vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
+
+ vpshufb xmm2, [SHUF_MASK]
+
+ vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
+%endmacro
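+
+;; Note: GCM_INIT assumes a 96-bit IV, as reflected above: the 12 IV bytes are
+;; padded with a big-endian 0x00000001 to form J0 (stored as OrigIV), while
+;; CurCount holds the same value byte-swapped through SHUF_MASK so that the
+;; counter can later be advanced with plain vpaddd adds.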
+
+%macro GCM_ENC_DEC_SMALL 12
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%DATA_OFFSET %7
+%define %%LENGTH %8
+%define %%NUM_BLOCKS %9
+%define %%CTR %10
+%define %%HASH %11
+%define %%INSTANCE_TYPE %12
+
+ ;; NOTE: the check below is obsolete in the current implementation. The check is already done in GCM_ENC_DEC.
+ ;; cmp %%NUM_BLOCKS, 0
+ ;; je %%_small_initial_blocks_encrypted
+ cmp %%NUM_BLOCKS, 8
+ je %%_small_initial_num_blocks_is_8
+ cmp %%NUM_BLOCKS, 7
+ je %%_small_initial_num_blocks_is_7
+ cmp %%NUM_BLOCKS, 6
+ je %%_small_initial_num_blocks_is_6
+ cmp %%NUM_BLOCKS, 5
+ je %%_small_initial_num_blocks_is_5
+ cmp %%NUM_BLOCKS, 4
+ je %%_small_initial_num_blocks_is_4
+ cmp %%NUM_BLOCKS, 3
+ je %%_small_initial_num_blocks_is_3
+ cmp %%NUM_BLOCKS, 2
+ je %%_small_initial_num_blocks_is_2
+
+ jmp %%_small_initial_num_blocks_is_1
+
+
+%%_small_initial_num_blocks_is_8:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 8, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_7:
+ ;; r13 - %%LENGTH
+ ;; xmm12 - T1
+ ;; xmm13 - T2
+ ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
+ ;; xmm15 - T4
+ ;; xmm11 - T5
+ ;; xmm9 - CTR
+ ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
+ ;; xmm2 - XMM2
+ ;; xmm3 - XMM3
+ ;; xmm4 - XMM4
+ ;; xmm5 - XMM5
+ ;; xmm6 - XMM6
+ ;; xmm7 - XMM7
+ ;; xmm8 - XMM8 - AAD HASH IN
+ ;; xmm10 - T6
+ ;; xmm0 - T_key
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_6:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_5:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_4:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_3:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_2:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+ jmp %%_small_initial_blocks_encrypted
+
+%%_small_initial_num_blocks_is_1:
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC, %%INSTANCE_TYPE
+
+ ;; Note: zero initial blocks not allowed.
+
+%%_small_initial_blocks_encrypted:
+
+%endmacro ; GCM_ENC_DEC_SMALL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
+; has been initialized by GCM_INIT
+; Requires the input data to be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 7
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%INSTANCE_TYPE %7
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16-byte blocks in the message
+; process (number of 16-byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16-byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_enc_dec_done
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ ;; Update length of data processed
+ add [%%GDATA_CTX+InLen], %%PLAIN_CYPH_LEN
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+ vmovdqu xmm8, [%%GDATA_CTX + AadHash]
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: partial block processing only makes sense for multi_call here.
+ ;; Used for the update flow - if there was a previous partial
+ ;; block fill the remaining bytes here.
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+%endif
+
+ ;; lift CTR set from initial_blocks to here
+%ifidn %%INSTANCE_TYPE, single_call
+ vmovdqu xmm9, xmm2
+%else
+ vmovdqu xmm9, [%%GDATA_CTX + CurCount]
+%endif
+
+ ;; Save the amount of data left to process in r13
+ mov r13, %%PLAIN_CYPH_LEN
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: %%DATA_OFFSET is zero in single_call case.
+ ;; Consequently PLAIN_CYPH_LEN will never be zero after
+ ;; %%DATA_OFFSET subtraction below.
+ sub r13, %%DATA_OFFSET
+
+ ;; There may be no more data if it was consumed in the partial block.
+ cmp r13, 0
+ je %%_enc_dec_done
+%endif ; %%INSTANCE_TYPE, multi_call
+ mov r10, r13
+
+ ;; Determine how many blocks to process in INITIAL
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+
+ ;; Process one additional block in INITIAL if there is a partial block
+ and r10, 0xf
+ blsmsk r10, r10 ; Set CF if zero
+ cmc ; Flip CF
+ adc r12, 0x0 ; Process an additional INITIAL block if CF set
+
+ ;; Fewer than 128 bytes will be handled by the small message code, which
+ ;; can process up to 7 full 16B blocks plus a trailing partial block.
+ cmp r13, 128
+ jge %%_large_message_path
+
+ GCM_ENC_DEC_SMALL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, r13, r12, xmm9, xmm14, %%INSTANCE_TYPE
+ jmp %%_ghash_done
+
+%%_large_message_path:
+ and r12, 0x7 ; Still, don't allow 8 INITIAL blocks since this will
+ ; be handled by the x8 partial loop.
+
+ cmp r12, 0
+ je %%_initial_num_blocks_is_0
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ ;; r13 - %%LENGTH
+ ;; xmm12 - T1
+ ;; xmm13 - T2
+ ;; xmm14 - T3 - AAD HASH OUT when not producing 8 AES keys
+ ;; xmm15 - T4
+ ;; xmm11 - T5
+ ;; xmm9 - CTR
+ ;; xmm1 - XMM1 - Cipher + Hash when producing 8 AES keys
+ ;; xmm2 - XMM2
+ ;; xmm3 - XMM3
+ ;; xmm4 - XMM4
+ ;; xmm5 - XMM5
+ ;; xmm6 - XMM6
+ ;; xmm7 - XMM7
+ ;; xmm8 - XMM8 - AAD HASH IN
+ ;; xmm10 - T6
+ ;; xmm0 - T_key
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ ;; The entire message was encrypted and processed in INITIAL and now needs to be hashed
+ cmp r13, 0
+ je %%_encrypt_done
+
+ ;; Encrypt the final <16 byte (partial) block, then hash
+ cmp r13, 16
+ jl %%_encrypt_final_partial
+
+ ;; Process 7 full blocks plus a partial block
+ cmp r13, 128
+ jl %%_encrypt_by_8_partial
+
+
+%%_encrypt_by_8_parallel:
+ ;; in_order vs. out_order is an optimization to increment the counter without shuffling
+ ;; it back into little endian. r15d keeps track of when we need to increment in order so
+ ;; that the carry is handled correctly.
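+ ;; Rough example: if the low counter byte is above 255-8 (say 250), adding
+ ;; the next 8 block increments would carry out of that byte, so the slower
+ ;; in_order path below shuffles the counter back to little endian, increments
+ ;; it there and shuffles it again; otherwise the out_order path adds
+ ;; ONEf/TWOf directly on the byte-swapped form.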
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [rel SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ ;; xmm0 - T1
+ ;; xmm10 - T2
+ ;; xmm11 - T3
+ ;; xmm12 - T4
+ ;; xmm13 - T5
+ ;; xmm14 - T6
+ ;; xmm9 - CTR
+ ;; xmm1 - XMM1
+ ;; xmm2 - XMM2
+ ;; xmm3 - XMM3
+ ;; xmm4 - XMM4
+ ;; xmm5 - XMM5
+ ;; xmm6 - XMM6
+ ;; xmm7 - XMM7
+ ;; xmm8 - XMM8
+ ;; xmm15 - T7
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC, full
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ cmp r13, 128
+ jge %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+ jmp %%_encrypt_by_8_parallel_done
+
+%%_encrypt_by_8:
+ vpshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, full
+ vpshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ cmp r13, 128
+ jge %%_encrypt_by_8_new
+ vpshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_parallel_done:
+ ;; Test to see if we need a by-8 pass with a partial block. At this point
+ ;; the bytes remaining should be either zero or between 113 and 127.
+ cmp r13, 0
+ je %%_encrypt_done
+
+%%_encrypt_by_8_partial:
+ ;; Shuffle needed to align key for partial block xor. out_order
+ ;; is a little faster because it avoids extra shuffles.
+ ;; TBD: Might need to account for when we don't have room to increment the counter.
+
+
+ ;; Process parallel buffers with a final partial block.
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC, partial
+
+
+ add %%DATA_OFFSET, 128-16
+ sub r13, 128-16
+
+%%_encrypt_final_partial:
+
+ vpshufb xmm8, [SHUF_MASK]
+ mov [%%GDATA_CTX + PBlockLen], r13
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm8
+
+ ;; xmm8 - Final encrypted counter - need to hash with partial or full block ciphertext
+ ;; GDATA, KEY, T1, T2
+ ENCRYPT_FINAL_PARTIAL_BLOCK xmm8, xmm0, xmm10, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET
+
+ vpshufb xmm8, [SHUF_MASK]
+
+
+%%_encrypt_done:
+
+ ;; Mapping to macro parameters
+ ;; IN:
+ ;; xmm9 contains the counter
+ ;; xmm1-xmm8 contain the xor'd ciphertext
+ ;; OUT:
+ ;; xmm14 contains the final hash
+ ;; GDATA, T1, T2, T3, T4, T5, T6, T7, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+%ifidn %%INSTANCE_TYPE, multi_call
+ mov r13, [%%GDATA_CTX + PBlockLen]
+ cmp r13, 0
+ jz %%_hash_last_8
+ GHASH_LAST_7 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
+ ;; XOR the partial block ciphertext into the hash
+ vpxor xmm14, xmm14, xmm8
+ jmp %%_ghash_done
+%endif
+%%_hash_last_8:
+ GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+%%_ghash_done:
+ vmovdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
+
+%%_enc_dec_done:
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of the last partial block after GCM_UPDATE finishes.
+; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX) and whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%INSTANCE_TYPE %6
+%define %%PLAIN_CYPH_LEN rax
+
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+ ;; Start AES as early as possible
+ vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; When the GCM functions are invoked as individual parts (init, update,
+ ;; finalize) the AAD hash must be reloaded here; in the single_call case
+ ;; xmm14 already holds it, which removes a write-to-read dependency on AadHash.
+ vmovdqu xmm14, [%%GDATA_CTX + AadHash]
+
+ ;; GHASH the final partial block. If we did this as a single call then
+ ;; the partial block was already handled in the main GCM_ENC_DEC macro.
+ mov r12, [%%GDATA_CTX + PBlockLen]
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+%endif
+
+ mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovd xmm15, r12d ; len(A) in xmm15
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
+ vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpxor xmm9, xmm9, xmm14
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ; GCM_COMPLETE
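+
+;; For reference, GCM_COMPLETE implements the standard GCM tag computation:
+;;     S = GHASH_H( A || C || len(A)_64 || len(C)_64 )   ; lengths in bits,
+;;                                                       ; A and C zero-padded
+;;     T = E(K, J0) XOR S, truncated to 8, 12 or 16 bytes (AUTH_TAG_LEN)
+;; where J0 is the saved OrigIV and H is the hash key from GDATA_KEY. This is
+;; a descriptive sketch of the math only, not additional code.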
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_avx_gen4 /
+; aes_gcm_precomp_192_avx_gen4 /
+; aes_gcm_precomp_256_avx_gen4
+; (struct gcm_key_data *key_data)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(precomp,_)
+FN_NAME(precomp,_):
+ endbranch
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [rel SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, xmm6, 1
+ vpsrlq xmm2, xmm2, 63
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8
+ vpsrldq xmm1, xmm1, 8
+ vpor xmm6, xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [TWOONE]
+ vpand xmm2, xmm2, [POLY]
+ vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_avx_gen4 / aes_gcm_init_192_avx_gen4 / aes_gcm_init_256_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(init,_)
+FN_NAME(init,_):
+ endbranch
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ push arg5
+ sub rsp, 1*16
+ vmovdqu [rsp + 0*16],xmm6
+ mov arg5, [rsp + 1*16 + 8*3 + 8*5]
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4, arg5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + 0*16]
+ add rsp, 1*16
+ pop arg5
+%endif
+ pop r13
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 /
+; aes_gcm_enc_128_update_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_update_)
+FN_NAME(enc,_update_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_avx_gen4 / aes_gcm_dec_192_update_avx_gen4 /
+; aes_gcm_dec_256_update_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_update_)
+FN_NAME(dec,_update_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_avx_gen4 / aes_gcm_enc_192_finalize_avx_gen4 /
+; aes_gcm_enc_256_finalize_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(enc,_finalize_)
+FN_NAME(enc,_finalize_):
+ endbranch
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm9
+ vmovdqu [rsp + 2*16], xmm11
+ vmovdqu [rsp + 3*16], xmm14
+ vmovdqu [rsp + 4*16], xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + 4*16]
+ vmovdqu xmm14, [rsp + 3*16]
+ vmovdqu xmm11, [rsp + 2*16]
+ vmovdqu xmm9, [rsp + 1*16]
+ vmovdqu xmm6, [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_avx_gen4 / aes_gcm_dec_192_finalize_avx_gen4
+; aes_gcm_dec_256_finalize_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(dec,_finalize_)
+FN_NAME(dec,_finalize_):
+ endbranch
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm9
+ vmovdqu [rsp + 2*16], xmm11
+ vmovdqu [rsp + 3*16], xmm14
+ vmovdqu [rsp + 4*16], xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + 4*16]
+ vmovdqu xmm14, [rsp + 3*16]
+ vmovdqu xmm11, [rsp + 2*16]
+ vmovdqu xmm9, [rsp + 1*16]
+ vmovdqu xmm6, [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_avx_gen4 / aes_gcm_enc_192_avx_gen4 / aes_gcm_enc_256_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_)
+FN_NAME(enc,_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call
+
+ FUNC_RESTORE
+
+ ret
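+
+;; Hedged usage sketch (C-like pseudocode in comment form; the prototypes are
+;; the ones documented in this file, while key/context setup such as key
+;; expansion is assumed to happen elsewhere, e.g. alongside the precomp
+;; routine above):
+;;     struct gcm_key_data key;        /* expanded keys + hash key powers */
+;;     struct gcm_context_data ctx;
+;;     /* single-shot: */
+;;     aes_gcm_enc_128_avx_gen4(&key, &ctx, out, in, len, iv, aad, aad_len,
+;;                              tag, tag_len);
+;;     /* or streaming: init + N x update + finalize */
+;;     aes_gcm_init_128_avx_gen4(&key, &ctx, iv, aad, aad_len);
+;;     aes_gcm_enc_128_update_avx_gen4(&key, &ctx, out, in, len);
+;;     aes_gcm_enc_128_finalize_avx_gen4(&key, &ctx, tag, tag_len);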
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_avx_gen4 / aes_gcm_dec_192_avx_gen4 / aes_gcm_dec_256_avx_gen4
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_)
+FN_NAME(dec,_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm
new file mode 100644
index 000000000..e823b7959
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm
@@ -0,0 +1,291 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef GCM_DEFINES_ASM_INCLUDED
+%define GCM_DEFINES_ASM_INCLUDED
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+
+
+;;;;;;
+
+section .data
+
+align 16
+
+POLY dq 0x0000000000000001, 0xC200000000000000
+
+align 64
+POLY2 dq 0x00000001C2000000, 0xC200000000000000
+ dq 0x00000001C2000000, 0xC200000000000000
+ dq 0x00000001C2000000, 0xC200000000000000
+ dq 0x00000001C2000000, 0xC200000000000000
+align 16
+TWOONE dq 0x0000000000000001, 0x0000000100000000
+
+; order of these constants should not change.
+; more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
+
+align 64
+SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+ dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+
+SHIFT_MASK dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
+ALL_F dq 0xffffffffffffffff, 0xffffffffffffffff
+ZERO dq 0x0000000000000000, 0x0000000000000000
+ONE dq 0x0000000000000001, 0x0000000000000000
+TWO dq 0x0000000000000002, 0x0000000000000000
+ONEf dq 0x0000000000000000, 0x0100000000000000
+TWOf dq 0x0000000000000000, 0x0200000000000000
+
+align 64
+ddq_add_1234:
+ dq 0x0000000000000001, 0x0000000000000000
+ dq 0x0000000000000002, 0x0000000000000000
+ dq 0x0000000000000003, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+
+align 64
+ddq_add_5678:
+ dq 0x0000000000000005, 0x0000000000000000
+ dq 0x0000000000000006, 0x0000000000000000
+ dq 0x0000000000000007, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+
+align 64
+ddq_add_4444:
+ dq 0x0000000000000004, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+ dq 0x0000000000000004, 0x0000000000000000
+
+align 64
+ddq_add_8888:
+ dq 0x0000000000000008, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+ dq 0x0000000000000008, 0x0000000000000000
+
+align 64
+ddq_addbe_1234:
+ dq 0x0000000000000000, 0x0100000000000000
+ dq 0x0000000000000000, 0x0200000000000000
+ dq 0x0000000000000000, 0x0300000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+
+align 64
+ddq_addbe_5678:
+ dq 0x0000000000000000, 0x0500000000000000
+ dq 0x0000000000000000, 0x0600000000000000
+ dq 0x0000000000000000, 0x0700000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+
+align 64
+ddq_addbe_4444:
+ dq 0x0000000000000000, 0x0400000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+ dq 0x0000000000000000, 0x0400000000000000
+
+align 64
+ddq_addbe_8888:
+ dq 0x0000000000000000, 0x0800000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+ dq 0x0000000000000000, 0x0800000000000000
+
+align 64
+byte_len_to_mask_table:
+ dw 0x0000, 0x0001, 0x0003, 0x0007,
+ dw 0x000f, 0x001f, 0x003f, 0x007f,
+ dw 0x00ff, 0x01ff, 0x03ff, 0x07ff,
+ dw 0x0fff, 0x1fff, 0x3fff, 0x7fff,
+ dw 0xffff
+
+align 64
+byte64_len_to_mask_table:
+ dq 0x0000000000000000, 0x0000000000000001
+ dq 0x0000000000000003, 0x0000000000000007
+ dq 0x000000000000000f, 0x000000000000001f
+ dq 0x000000000000003f, 0x000000000000007f
+ dq 0x00000000000000ff, 0x00000000000001ff
+ dq 0x00000000000003ff, 0x00000000000007ff
+ dq 0x0000000000000fff, 0x0000000000001fff
+ dq 0x0000000000003fff, 0x0000000000007fff
+ dq 0x000000000000ffff, 0x000000000001ffff
+ dq 0x000000000003ffff, 0x000000000007ffff
+ dq 0x00000000000fffff, 0x00000000001fffff
+ dq 0x00000000003fffff, 0x00000000007fffff
+ dq 0x0000000000ffffff, 0x0000000001ffffff
+ dq 0x0000000003ffffff, 0x0000000007ffffff
+ dq 0x000000000fffffff, 0x000000001fffffff
+ dq 0x000000003fffffff, 0x000000007fffffff
+ dq 0x00000000ffffffff, 0x00000001ffffffff
+ dq 0x00000003ffffffff, 0x00000007ffffffff
+ dq 0x0000000fffffffff, 0x0000001fffffffff
+ dq 0x0000003fffffffff, 0x0000007fffffffff
+ dq 0x000000ffffffffff, 0x000001ffffffffff
+ dq 0x000003ffffffffff, 0x000007ffffffffff
+ dq 0x00000fffffffffff, 0x00001fffffffffff
+ dq 0x00003fffffffffff, 0x00007fffffffffff
+ dq 0x0000ffffffffffff, 0x0001ffffffffffff
+ dq 0x0003ffffffffffff, 0x0007ffffffffffff
+ dq 0x000fffffffffffff, 0x001fffffffffffff
+ dq 0x003fffffffffffff, 0x007fffffffffffff
+ dq 0x00ffffffffffffff, 0x01ffffffffffffff
+ dq 0x03ffffffffffffff, 0x07ffffffffffffff
+ dq 0x0fffffffffffffff, 0x1fffffffffffffff
+ dq 0x3fffffffffffffff, 0x7fffffffffffffff
+ dq 0xffffffffffffffff
+
+align 64
+mask_out_top_block:
+ dq 0xffffffffffffffff, 0xffffffffffffffff
+ dq 0xffffffffffffffff, 0xffffffffffffffff
+ dq 0xffffffffffffffff, 0xffffffffffffffff
+ dq 0x0000000000000000, 0x0000000000000000
+
+section .text
+
+
+;;define the fields of gcm_data struct
+;typedef struct gcm_data
+;{
+; u8 expanded_keys[16*15];
+; u8 shifted_hkey_1[16]; // store HashKey <<1 mod poly here
+; u8 shifted_hkey_2[16]; // store HashKey^2 <<1 mod poly here
+; u8 shifted_hkey_3[16]; // store HashKey^3 <<1 mod poly here
+; u8 shifted_hkey_4[16]; // store HashKey^4 <<1 mod poly here
+; u8 shifted_hkey_5[16]; // store HashKey^5 <<1 mod poly here
+; u8 shifted_hkey_6[16]; // store HashKey^6 <<1 mod poly here
+; u8 shifted_hkey_7[16]; // store HashKey^7 <<1 mod poly here
+; u8 shifted_hkey_8[16]; // store HashKey^8 <<1 mod poly here
+; u8 shifted_hkey_1_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_2_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_3_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_4_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_5_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_6_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_7_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_8_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+;} gcm_data;
+
+%ifndef GCM_KEYS_VAES_AVX512_INCLUDED
+%define HashKey 16*15 ; store HashKey <<1 mod poly here
+%define HashKey_1 16*15 ; store HashKey <<1 mod poly here
+%define HashKey_2 16*16 ; store HashKey^2 <<1 mod poly here
+%define HashKey_3 16*17 ; store HashKey^3 <<1 mod poly here
+%define HashKey_4 16*18 ; store HashKey^4 <<1 mod poly here
+%define HashKey_5 16*19 ; store HashKey^5 <<1 mod poly here
+%define HashKey_6 16*20 ; store HashKey^6 <<1 mod poly here
+%define HashKey_7 16*21 ; store HashKey^7 <<1 mod poly here
+%define HashKey_8 16*22 ; store HashKey^8 <<1 mod poly here
+%define HashKey_k 16*23 ; store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_2_k 16*24 ; store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_3_k 16*25 ; store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_4_k 16*26 ; store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_5_k 16*27 ; store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_6_k 16*28 ; store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_7_k 16*29 ; store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_8_k 16*30 ; store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+%endif
+
+%define AadHash 16*0 ; store current Hash of data which has been input
+%define AadLen 16*1 ; store length of input data which will not be encrypted or decrypted
+%define InLen (16*1)+8 ; store length of input data which will be encrypted or decrypted
+%define PBlockEncKey 16*2 ; encryption key for the partial block at the end of the previous update
+%define OrigIV 16*3 ; input IV
+%define CurCount 16*4 ; Current counter for generation of encryption key
+%define PBlockLen 16*5 ; length of partial block at the end of the previous update
+
+%define reg(q) xmm %+ q
+%define arg(x) [r14 + STACK_OFFSET + 8*x]
+
+
+
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+ %xdefine arg4 r9
+ %xdefine arg5 rsi ;[r14 + STACK_OFFSET + 8*5] - need push and load
+ %xdefine arg6 [r14 + STACK_OFFSET + 8*6]
+ %xdefine arg7 [r14 + STACK_OFFSET + 8*7]
+ %xdefine arg8 [r14 + STACK_OFFSET + 8*8]
+ %xdefine arg9 [r14 + STACK_OFFSET + 8*9]
+ %xdefine arg10 [r14 + STACK_OFFSET + 8*10]
+
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+ %xdefine arg4 rcx
+ %xdefine arg5 r8
+ %xdefine arg6 r9
+ %xdefine arg7 [r14 + STACK_OFFSET + 8*1]
+ %xdefine arg8 [r14 + STACK_OFFSET + 8*2]
+ %xdefine arg9 [r14 + STACK_OFFSET + 8*3]
+ %xdefine arg10 [r14 + STACK_OFFSET + 8*4]
+%endif
+
+%ifdef NT_LDST
+ %define NT_LD
+ %define NT_ST
+%endif
+
+;;; Use non-temporal loads
+%ifdef NT_LD
+ %define XLDR movntdqa
+ %define VXLDR vmovntdqa
+ %define VX512LDR vmovntdqa
+%else
+ %define XLDR movdqu
+ %define VXLDR vmovdqu
+ %define VX512LDR vmovdqu8
+%endif
+
+;;; Use non-temporal stores
+%ifdef NT_ST
+ %define XSTR movntdq
+ %define VXSTR vmovntdq
+ %define VX512STR vmovntdq
+%else
+ %define XSTR movdqu
+ %define VXSTR vmovdqu
+ %define VX512STR vmovdqu8
+%endif
+
+%endif ; GCM_DEFINES_ASM_INCLUDED
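
The XLDR/VXLDR/VX512LDR and XSTR/VXSTR/VX512STR macros above switch every bulk data access between streaming (non-temporal) and ordinary unaligned moves when NT_LDST is defined. A rough C-intrinsics analogue of the SSE pair, using standard intrinsic names rather than anything defined by this library:

#include <immintrin.h>

#ifdef NT_LDST
# define XLDR(p)     _mm_stream_load_si128((__m128i *)(p))        /* movntdqa */
# define XSTR(p, x)  _mm_stream_si128((__m128i *)(p), (x))        /* movntdq  */
#else
# define XLDR(p)     _mm_loadu_si128((const __m128i *)(p))        /* movdqu   */
# define XSTR(p, x)  _mm_storeu_si128((__m128i *)(p), (x))        /* movdqu   */
#endif
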
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_keys_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_keys_vaes_avx512.asm
new file mode 100644
index 000000000..fd8aa05a6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_keys_vaes_avx512.asm
@@ -0,0 +1,233 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef GCM_KEYS_VAES_AVX512_INCLUDED
+%define GCM_KEYS_VAES_AVX512_INCLUDED
+
+;; Define the fields of gcm_key_data struct:
+;; uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS];
+;; uint8_t shifted_hkey_9_128[GCM_ENC_KEY_LEN * (128 - 8)];
+;; uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // HashKey^8 <<1 mod poly
+;; uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // HashKey^7 <<1 mod poly
+;; uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // HashKey^6 <<1 mod poly
+;; uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // HashKey^5 <<1 mod poly
+;; uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // HashKey^4 <<1 mod poly
+;; uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // HashKey^3 <<1 mod poly
+;; uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // HashKey^2 <<1 mod poly
+;; uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // HashKey <<1 mod poly
+
+%ifdef GCM_BIG_DATA
+;;
+;; Key structure holds up to 128 ghash keys
+;;
+%define HashKey_128 (16*15) ; HashKey^128 <<1 mod poly
+%define HashKey_127 (16*16) ; HashKey^127 <<1 mod poly
+%define HashKey_126 (16*17) ; HashKey^126 <<1 mod poly
+%define HashKey_125 (16*18) ; HashKey^125 <<1 mod poly
+%define HashKey_124 (16*19) ; HashKey^124 <<1 mod poly
+%define HashKey_123 (16*20) ; HashKey^123 <<1 mod poly
+%define HashKey_122 (16*21) ; HashKey^122 <<1 mod poly
+%define HashKey_121 (16*22) ; HashKey^121 <<1 mod poly
+%define HashKey_120 (16*23) ; HashKey^120 <<1 mod poly
+%define HashKey_119 (16*24) ; HashKey^119 <<1 mod poly
+%define HashKey_118 (16*25) ; HashKey^118 <<1 mod poly
+%define HashKey_117 (16*26) ; HashKey^117 <<1 mod poly
+%define HashKey_116 (16*27) ; HashKey^116 <<1 mod poly
+%define HashKey_115 (16*28) ; HashKey^115 <<1 mod poly
+%define HashKey_114 (16*29) ; HashKey^114 <<1 mod poly
+%define HashKey_113 (16*30) ; HashKey^113 <<1 mod poly
+%define HashKey_112 (16*31) ; HashKey^112 <<1 mod poly
+%define HashKey_111 (16*32) ; HashKey^111 <<1 mod poly
+%define HashKey_110 (16*33) ; HashKey^110 <<1 mod poly
+%define HashKey_109 (16*34) ; HashKey^109 <<1 mod poly
+%define HashKey_108 (16*35) ; HashKey^108 <<1 mod poly
+%define HashKey_107 (16*36) ; HashKey^107 <<1 mod poly
+%define HashKey_106 (16*37) ; HashKey^106 <<1 mod poly
+%define HashKey_105 (16*38) ; HashKey^105 <<1 mod poly
+%define HashKey_104 (16*39) ; HashKey^104 <<1 mod poly
+%define HashKey_103 (16*40) ; HashKey^103 <<1 mod poly
+%define HashKey_102 (16*41) ; HashKey^102 <<1 mod poly
+%define HashKey_101 (16*42) ; HashKey^101 <<1 mod poly
+%define HashKey_100 (16*43) ; HashKey^100 <<1 mod poly
+%define HashKey_99 (16*44) ; HashKey^99 <<1 mod poly
+%define HashKey_98 (16*45) ; HashKey^98 <<1 mod poly
+%define HashKey_97 (16*46) ; HashKey^97 <<1 mod poly
+%define HashKey_96 (16*47) ; HashKey^96 <<1 mod poly
+%define HashKey_95 (16*48) ; HashKey^95 <<1 mod poly
+%define HashKey_94 (16*49) ; HashKey^94 <<1 mod poly
+%define HashKey_93 (16*50) ; HashKey^93 <<1 mod poly
+%define HashKey_92 (16*51) ; HashKey^92 <<1 mod poly
+%define HashKey_91 (16*52) ; HashKey^91 <<1 mod poly
+%define HashKey_90 (16*53) ; HashKey^90 <<1 mod poly
+%define HashKey_89 (16*54) ; HashKey^89 <<1 mod poly
+%define HashKey_88 (16*55) ; HashKey^88 <<1 mod poly
+%define HashKey_87 (16*56) ; HashKey^87 <<1 mod poly
+%define HashKey_86 (16*57) ; HashKey^86 <<1 mod poly
+%define HashKey_85 (16*58) ; HashKey^85 <<1 mod poly
+%define HashKey_84 (16*59) ; HashKey^84 <<1 mod poly
+%define HashKey_83 (16*60) ; HashKey^83 <<1 mod poly
+%define HashKey_82 (16*61) ; HashKey^82 <<1 mod poly
+%define HashKey_81 (16*62) ; HashKey^81 <<1 mod poly
+%define HashKey_80 (16*63) ; HashKey^80 <<1 mod poly
+%define HashKey_79 (16*64) ; HashKey^79 <<1 mod poly
+%define HashKey_78 (16*65) ; HashKey^78 <<1 mod poly
+%define HashKey_77 (16*66) ; HashKey^77 <<1 mod poly
+%define HashKey_76 (16*67) ; HashKey^76 <<1 mod poly
+%define HashKey_75 (16*68) ; HashKey^75 <<1 mod poly
+%define HashKey_74 (16*69) ; HashKey^74 <<1 mod poly
+%define HashKey_73 (16*70) ; HashKey^73 <<1 mod poly
+%define HashKey_72 (16*71) ; HashKey^72 <<1 mod poly
+%define HashKey_71 (16*72) ; HashKey^71 <<1 mod poly
+%define HashKey_70 (16*73) ; HashKey^70 <<1 mod poly
+%define HashKey_69 (16*74) ; HashKey^69 <<1 mod poly
+%define HashKey_68 (16*75) ; HashKey^68 <<1 mod poly
+%define HashKey_67 (16*76) ; HashKey^67 <<1 mod poly
+%define HashKey_66 (16*77) ; HashKey^66 <<1 mod poly
+%define HashKey_65 (16*78) ; HashKey^65 <<1 mod poly
+%define HashKey_64 (16*79) ; HashKey^64 <<1 mod poly
+%define HashKey_63 (16*80) ; HashKey^63 <<1 mod poly
+%define HashKey_62 (16*81) ; HashKey^62 <<1 mod poly
+%define HashKey_61 (16*82) ; HashKey^61 <<1 mod poly
+%define HashKey_60 (16*83) ; HashKey^60 <<1 mod poly
+%define HashKey_59 (16*84) ; HashKey^59 <<1 mod poly
+%define HashKey_58 (16*85) ; HashKey^58 <<1 mod poly
+%define HashKey_57 (16*86) ; HashKey^57 <<1 mod poly
+%define HashKey_56 (16*87) ; HashKey^56 <<1 mod poly
+%define HashKey_55 (16*88) ; HashKey^55 <<1 mod poly
+%define HashKey_54 (16*89) ; HashKey^54 <<1 mod poly
+%define HashKey_53 (16*90) ; HashKey^53 <<1 mod poly
+%define HashKey_52 (16*91) ; HashKey^52 <<1 mod poly
+%define HashKey_51 (16*92) ; HashKey^51 <<1 mod poly
+%define HashKey_50 (16*93) ; HashKey^50 <<1 mod poly
+%define HashKey_49 (16*94) ; HashKey^49 <<1 mod poly
+%define HashKey_48 (16*95) ; HashKey^48 <<1 mod poly
+%define HashKey_47 (16*96) ; HashKey^47 <<1 mod poly
+%define HashKey_46 (16*97) ; HashKey^46 <<1 mod poly
+%define HashKey_45 (16*98) ; HashKey^45 <<1 mod poly
+%define HashKey_44 (16*99) ; HashKey^44 <<1 mod poly
+%define HashKey_43 (16*100) ; HashKey^43 <<1 mod poly
+%define HashKey_42 (16*101) ; HashKey^42 <<1 mod poly
+%define HashKey_41 (16*102) ; HashKey^41 <<1 mod poly
+%define HashKey_40 (16*103) ; HashKey^40 <<1 mod poly
+%define HashKey_39 (16*104) ; HashKey^39 <<1 mod poly
+%define HashKey_38 (16*105) ; HashKey^38 <<1 mod poly
+%define HashKey_37 (16*106) ; HashKey^37 <<1 mod poly
+%define HashKey_36 (16*107) ; HashKey^36 <<1 mod poly
+%define HashKey_35 (16*108) ; HashKey^35 <<1 mod poly
+%define HashKey_34 (16*109) ; HashKey^34 <<1 mod poly
+%define HashKey_33 (16*110) ; HashKey^33 <<1 mod poly
+%define HashKey_32 (16*111) ; HashKey^32 <<1 mod poly
+%define HashKey_31 (16*112) ; HashKey^31 <<1 mod poly
+%define HashKey_30 (16*113) ; HashKey^30 <<1 mod poly
+%define HashKey_29 (16*114) ; HashKey^29 <<1 mod poly
+%define HashKey_28 (16*115) ; HashKey^28 <<1 mod poly
+%define HashKey_27 (16*116) ; HashKey^27 <<1 mod poly
+%define HashKey_26 (16*117) ; HashKey^26 <<1 mod poly
+%define HashKey_25 (16*118) ; HashKey^25 <<1 mod poly
+%define HashKey_24 (16*119) ; HashKey^24 <<1 mod poly
+%define HashKey_23 (16*120) ; HashKey^23 <<1 mod poly
+%define HashKey_22 (16*121) ; HashKey^22 <<1 mod poly
+%define HashKey_21 (16*122) ; HashKey^21 <<1 mod poly
+%define HashKey_20 (16*123) ; HashKey^20 <<1 mod poly
+%define HashKey_19 (16*124) ; HashKey^19 <<1 mod poly
+%define HashKey_18 (16*125) ; HashKey^18 <<1 mod poly
+%define HashKey_17 (16*126) ; HashKey^17 <<1 mod poly
+%define HashKey_16 (16*127) ; HashKey^16 <<1 mod poly
+%define HashKey_15 (16*128) ; HashKey^15 <<1 mod poly
+%define HashKey_14 (16*129) ; HashKey^14 <<1 mod poly
+%define HashKey_13 (16*130) ; HashKey^13 <<1 mod poly
+%define HashKey_12 (16*131) ; HashKey^12 <<1 mod poly
+%define HashKey_11 (16*132) ; HashKey^11 <<1 mod poly
+%define HashKey_10 (16*133) ; HashKey^10 <<1 mod poly
+%define HashKey_9 (16*134) ; HashKey^9 <<1 mod poly
+%define HashKey_8 (16*135) ; HashKey^8 <<1 mod poly
+%define HashKey_7 (16*136) ; HashKey^7 <<1 mod poly
+%define HashKey_6 (16*137) ; HashKey^6 <<1 mod poly
+%define HashKey_5 (16*138) ; HashKey^5 <<1 mod poly
+%define HashKey_4 (16*139) ; HashKey^4 <<1 mod poly
+%define HashKey_3 (16*140) ; HashKey^3 <<1 mod poly
+%define HashKey_2 (16*141) ; HashKey^2 <<1 mod poly
+%define HashKey_1 (16*142) ; HashKey <<1 mod poly
+%define HashKey (16*142) ; HashKey <<1 mod poly
+%else
+;;
+;; Key structure holds up to 48 ghash keys
+;;
+%define HashKey_48 (16*15) ; HashKey^48 <<1 mod poly
+%define HashKey_47 (16*16) ; HashKey^47 <<1 mod poly
+%define HashKey_46 (16*17) ; HashKey^46 <<1 mod poly
+%define HashKey_45 (16*18) ; HashKey^45 <<1 mod poly
+%define HashKey_44 (16*19) ; HashKey^44 <<1 mod poly
+%define HashKey_43 (16*20) ; HashKey^43 <<1 mod poly
+%define HashKey_42 (16*21) ; HashKey^42 <<1 mod poly
+%define HashKey_41 (16*22) ; HashKey^41 <<1 mod poly
+%define HashKey_40 (16*23) ; HashKey^40 <<1 mod poly
+%define HashKey_39 (16*24) ; HashKey^39 <<1 mod poly
+%define HashKey_38 (16*25) ; HashKey^38 <<1 mod poly
+%define HashKey_37 (16*26) ; HashKey^37 <<1 mod poly
+%define HashKey_36 (16*27) ; HashKey^36 <<1 mod poly
+%define HashKey_35 (16*28) ; HashKey^35 <<1 mod poly
+%define HashKey_34 (16*29) ; HashKey^34 <<1 mod poly
+%define HashKey_33 (16*30) ; HashKey^33 <<1 mod poly
+%define HashKey_32 (16*31) ; HashKey^32 <<1 mod poly
+%define HashKey_31 (16*32) ; HashKey^31 <<1 mod poly
+%define HashKey_30 (16*33) ; HashKey^30 <<1 mod poly
+%define HashKey_29 (16*34) ; HashKey^29 <<1 mod poly
+%define HashKey_28 (16*35) ; HashKey^28 <<1 mod poly
+%define HashKey_27 (16*36) ; HashKey^27 <<1 mod poly
+%define HashKey_26 (16*37) ; HashKey^26 <<1 mod poly
+%define HashKey_25 (16*38) ; HashKey^25 <<1 mod poly
+%define HashKey_24 (16*39) ; HashKey^24 <<1 mod poly
+%define HashKey_23 (16*40) ; HashKey^23 <<1 mod poly
+%define HashKey_22 (16*41) ; HashKey^22 <<1 mod poly
+%define HashKey_21 (16*42) ; HashKey^21 <<1 mod poly
+%define HashKey_20 (16*43) ; HashKey^20 <<1 mod poly
+%define HashKey_19 (16*44) ; HashKey^19 <<1 mod poly
+%define HashKey_18 (16*45) ; HashKey^18 <<1 mod poly
+%define HashKey_17 (16*46) ; HashKey^17 <<1 mod poly
+%define HashKey_16 (16*47) ; HashKey^16 <<1 mod poly
+%define HashKey_15 (16*48) ; HashKey^15 <<1 mod poly
+%define HashKey_14 (16*49) ; HashKey^14 <<1 mod poly
+%define HashKey_13 (16*50) ; HashKey^13 <<1 mod poly
+%define HashKey_12 (16*51) ; HashKey^12 <<1 mod poly
+%define HashKey_11 (16*52) ; HashKey^11 <<1 mod poly
+%define HashKey_10 (16*53) ; HashKey^10 <<1 mod poly
+%define HashKey_9 (16*54) ; HashKey^9 <<1 mod poly
+%define HashKey_8 (16*55) ; HashKey^8 <<1 mod poly
+%define HashKey_7 (16*56) ; HashKey^7 <<1 mod poly
+%define HashKey_6 (16*57) ; HashKey^6 <<1 mod poly
+%define HashKey_5 (16*58) ; HashKey^5 <<1 mod poly
+%define HashKey_4 (16*59) ; HashKey^4 <<1 mod poly
+%define HashKey_3 (16*60) ; HashKey^3 <<1 mod poly
+%define HashKey_2 (16*61) ; HashKey^2 <<1 mod poly
+%define HashKey_1 (16*62) ; HashKey <<1 mod poly
+%define HashKey (16*62) ; HashKey <<1 mod poly
+%endif ; !GCM_BIG_DATA
+
+%endif ; GCM_KEYS_VAES_AVX512_INCLUDED
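
Both tables above follow one pattern: hash-key powers are stored highest power first, beginning 15 16-byte blocks into the key structure (offset 16*15), so HashKey^n lives at byte offset 16*(15 + MAX_KEYS - n), with MAX_KEYS = 48 by default or 128 under GCM_BIG_DATA. A small self-check of that formula (hypothetical helper, not part of the library):

#include <assert.h>
#include <stddef.h>

static size_t hashkey_offset(unsigned n, unsigned max_keys)
{
	return 16u * (15u + max_keys - n);
}

static void hashkey_offset_selftest(void)
{
	assert(hashkey_offset(48, 48)   == 16 * 15);  /* HashKey_48                */
	assert(hashkey_offset(1, 48)    == 16 * 62);  /* HashKey_1 / HashKey       */
	assert(hashkey_offset(128, 128) == 16 * 15);  /* HashKey_128, GCM_BIG_DATA */
	assert(hashkey_offset(1, 128)   == 16 * 142); /* HashKey_1,   GCM_BIG_DATA */
}
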
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm
new file mode 100644
index 000000000..6f71e43fa
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm
@@ -0,0 +1,184 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%include "reg_sizes.asm"
+
+extern aes_gcm_init_128_sse
+extern aes_gcm_init_128_avx_gen4
+extern aes_gcm_init_128_avx_gen2
+
+extern aes_gcm_enc_128_sse
+extern aes_gcm_enc_128_avx_gen4
+extern aes_gcm_enc_128_avx_gen2
+extern aes_gcm_enc_128_update_sse
+extern aes_gcm_enc_128_update_avx_gen4
+extern aes_gcm_enc_128_update_avx_gen2
+extern aes_gcm_enc_128_finalize_sse
+extern aes_gcm_enc_128_finalize_avx_gen4
+extern aes_gcm_enc_128_finalize_avx_gen2
+
+extern aes_gcm_dec_128_sse
+extern aes_gcm_dec_128_avx_gen4
+extern aes_gcm_dec_128_avx_gen2
+extern aes_gcm_dec_128_update_sse
+extern aes_gcm_dec_128_update_avx_gen4
+extern aes_gcm_dec_128_update_avx_gen2
+extern aes_gcm_dec_128_finalize_sse
+extern aes_gcm_dec_128_finalize_avx_gen4
+extern aes_gcm_dec_128_finalize_avx_gen2
+
+extern aes_gcm_precomp_128_sse
+extern aes_gcm_precomp_128_avx_gen4
+extern aes_gcm_precomp_128_avx_gen2
+
+extern aes_gcm_init_256_sse
+extern aes_gcm_init_256_avx_gen4
+extern aes_gcm_init_256_avx_gen2
+
+extern aes_gcm_enc_256_sse
+extern aes_gcm_enc_256_avx_gen4
+extern aes_gcm_enc_256_avx_gen2
+extern aes_gcm_enc_256_update_sse
+extern aes_gcm_enc_256_update_avx_gen4
+extern aes_gcm_enc_256_update_avx_gen2
+extern aes_gcm_enc_256_finalize_sse
+extern aes_gcm_enc_256_finalize_avx_gen4
+extern aes_gcm_enc_256_finalize_avx_gen2
+
+extern aes_gcm_dec_256_sse
+extern aes_gcm_dec_256_avx_gen4
+extern aes_gcm_dec_256_avx_gen2
+extern aes_gcm_dec_256_update_sse
+extern aes_gcm_dec_256_update_avx_gen4
+extern aes_gcm_dec_256_update_avx_gen2
+extern aes_gcm_dec_256_finalize_sse
+extern aes_gcm_dec_256_finalize_avx_gen4
+extern aes_gcm_dec_256_finalize_avx_gen2
+
+extern aes_gcm_precomp_256_sse
+extern aes_gcm_precomp_256_avx_gen4
+extern aes_gcm_precomp_256_avx_gen2
+
+%if (AS_FEATURE_LEVEL) >= 10
+extern aes_gcm_precomp_128_vaes_avx512
+extern aes_gcm_init_128_vaes_avx512
+extern aes_gcm_enc_128_update_vaes_avx512
+extern aes_gcm_dec_128_update_vaes_avx512
+extern aes_gcm_enc_128_finalize_vaes_avx512
+extern aes_gcm_dec_128_finalize_vaes_avx512
+extern aes_gcm_enc_128_vaes_avx512
+extern aes_gcm_dec_128_vaes_avx512
+
+extern aes_gcm_precomp_256_vaes_avx512
+extern aes_gcm_init_256_vaes_avx512
+extern aes_gcm_enc_256_update_vaes_avx512
+extern aes_gcm_dec_256_update_vaes_avx512
+extern aes_gcm_enc_256_finalize_vaes_avx512
+extern aes_gcm_dec_256_finalize_vaes_avx512
+extern aes_gcm_enc_256_vaes_avx512
+extern aes_gcm_dec_256_vaes_avx512
+%endif
+
+section .text
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate aesni_gcm interfaces init, enc, enc_update, enc_finalize, dec, dec_update, dec_finalize and precomp
+;;;;
+mbin_interface aes_gcm_init_128
+mbin_dispatch_init7 aes_gcm_init_128, aes_gcm_init_128_sse, aes_gcm_init_128_sse, aes_gcm_init_128_avx_gen2, aes_gcm_init_128_avx_gen4, aes_gcm_init_128_avx_gen4, aes_gcm_init_128_vaes_avx512
+
+mbin_interface aes_gcm_enc_128
+mbin_dispatch_init7 aes_gcm_enc_128, aes_gcm_enc_128_sse, aes_gcm_enc_128_sse, aes_gcm_enc_128_avx_gen2, aes_gcm_enc_128_avx_gen4, aes_gcm_enc_128_avx_gen4, aes_gcm_enc_128_vaes_avx512
+
+mbin_interface aes_gcm_enc_128_update
+mbin_dispatch_init7 aes_gcm_enc_128_update, aes_gcm_enc_128_update_sse, aes_gcm_enc_128_update_sse, aes_gcm_enc_128_update_avx_gen2, aes_gcm_enc_128_update_avx_gen4, aes_gcm_enc_128_update_avx_gen4, aes_gcm_enc_128_update_vaes_avx512
+
+mbin_interface aes_gcm_enc_128_finalize
+mbin_dispatch_init7 aes_gcm_enc_128_finalize, aes_gcm_enc_128_finalize_sse, aes_gcm_enc_128_finalize_sse, aes_gcm_enc_128_finalize_avx_gen2, aes_gcm_enc_128_finalize_avx_gen4, aes_gcm_enc_128_finalize_avx_gen4, aes_gcm_enc_128_finalize_vaes_avx512
+
+mbin_interface aes_gcm_dec_128
+mbin_dispatch_init7 aes_gcm_dec_128, aes_gcm_dec_128_sse, aes_gcm_dec_128_sse, aes_gcm_dec_128_avx_gen2, aes_gcm_dec_128_avx_gen4, aes_gcm_dec_128_avx_gen4, aes_gcm_dec_128_vaes_avx512
+
+mbin_interface aes_gcm_dec_128_update
+mbin_dispatch_init7 aes_gcm_dec_128_update, aes_gcm_dec_128_update_sse, aes_gcm_dec_128_update_sse, aes_gcm_dec_128_update_avx_gen2, aes_gcm_dec_128_update_avx_gen4, aes_gcm_dec_128_update_avx_gen4, aes_gcm_dec_128_update_vaes_avx512
+
+mbin_interface aes_gcm_dec_128_finalize
+mbin_dispatch_init7 aes_gcm_dec_128_finalize, aes_gcm_dec_128_finalize_sse, aes_gcm_dec_128_finalize_sse, aes_gcm_dec_128_finalize_avx_gen2, aes_gcm_dec_128_finalize_avx_gen4, aes_gcm_dec_128_finalize_avx_gen4, aes_gcm_dec_128_finalize_vaes_avx512
+
+mbin_interface aes_gcm_precomp_128
+mbin_dispatch_init7 aes_gcm_precomp_128, aes_gcm_precomp_128_sse, aes_gcm_precomp_128_sse, aes_gcm_precomp_128_avx_gen2, aes_gcm_precomp_128_avx_gen4, aes_gcm_precomp_128_avx_gen4, aes_gcm_precomp_128_vaes_avx512
+
+;;;;
+; instantiate aesni_gcm interfaces init, enc, enc_update, enc_finalize, dec, dec_update, dec_finalize and precomp
+;;;;
+mbin_interface aes_gcm_init_256
+mbin_dispatch_init7 aes_gcm_init_256, aes_gcm_init_256_sse, aes_gcm_init_256_sse, aes_gcm_init_256_avx_gen2, aes_gcm_init_256_avx_gen4, aes_gcm_init_256_avx_gen4, aes_gcm_init_256_vaes_avx512
+
+mbin_interface aes_gcm_enc_256
+mbin_dispatch_init7 aes_gcm_enc_256, aes_gcm_enc_256_sse, aes_gcm_enc_256_sse, aes_gcm_enc_256_avx_gen2, aes_gcm_enc_256_avx_gen4, aes_gcm_enc_256_avx_gen4, aes_gcm_enc_256_vaes_avx512
+
+mbin_interface aes_gcm_enc_256_update
+mbin_dispatch_init7 aes_gcm_enc_256_update, aes_gcm_enc_256_update_sse, aes_gcm_enc_256_update_sse, aes_gcm_enc_256_update_avx_gen2, aes_gcm_enc_256_update_avx_gen4, aes_gcm_enc_256_update_avx_gen4, aes_gcm_enc_256_update_vaes_avx512
+
+mbin_interface aes_gcm_enc_256_finalize
+mbin_dispatch_init7 aes_gcm_enc_256_finalize, aes_gcm_enc_256_finalize_sse, aes_gcm_enc_256_finalize_sse, aes_gcm_enc_256_finalize_avx_gen2, aes_gcm_enc_256_finalize_avx_gen4, aes_gcm_enc_256_finalize_avx_gen4, aes_gcm_enc_256_finalize_vaes_avx512
+
+mbin_interface aes_gcm_dec_256
+mbin_dispatch_init7 aes_gcm_dec_256, aes_gcm_dec_256_sse, aes_gcm_dec_256_sse, aes_gcm_dec_256_avx_gen2, aes_gcm_dec_256_avx_gen4, aes_gcm_dec_256_avx_gen4, aes_gcm_dec_256_vaes_avx512
+
+mbin_interface aes_gcm_dec_256_update
+mbin_dispatch_init7 aes_gcm_dec_256_update, aes_gcm_dec_256_update_sse, aes_gcm_dec_256_update_sse, aes_gcm_dec_256_update_avx_gen2, aes_gcm_dec_256_update_avx_gen4, aes_gcm_dec_256_update_avx_gen4, aes_gcm_dec_256_update_vaes_avx512
+
+mbin_interface aes_gcm_dec_256_finalize
+mbin_dispatch_init7 aes_gcm_dec_256_finalize, aes_gcm_dec_256_finalize_sse, aes_gcm_dec_256_finalize_sse, aes_gcm_dec_256_finalize_avx_gen2, aes_gcm_dec_256_finalize_avx_gen4, aes_gcm_dec_256_finalize_avx_gen4, aes_gcm_dec_256_finalize_vaes_avx512
+
+mbin_interface aes_gcm_precomp_256
+mbin_dispatch_init7 aes_gcm_precomp_256, aes_gcm_precomp_256_sse, aes_gcm_precomp_256_sse, aes_gcm_precomp_256_avx_gen2, aes_gcm_precomp_256_avx_gen4, aes_gcm_precomp_256_avx_gen4, aes_gcm_precomp_256_vaes_avx512
+
+
+;;; func core, ver, snum
+slversion aes_gcm_enc_128, 00, 00, 02c0
+slversion aes_gcm_dec_128, 00, 00, 02c1
+slversion aes_gcm_init_128, 00, 00, 02c2
+slversion aes_gcm_enc_128_update, 00, 00, 02c3
+slversion aes_gcm_dec_128_update, 00, 00, 02c4
+slversion aes_gcm_enc_128_finalize, 00, 00, 02c5
+slversion aes_gcm_dec_128_finalize, 00, 00, 02c6
+slversion aes_gcm_enc_256, 00, 00, 02d0
+slversion aes_gcm_dec_256, 00, 00, 02d1
+slversion aes_gcm_init_256, 00, 00, 02d2
+slversion aes_gcm_enc_256_update, 00, 00, 02d3
+slversion aes_gcm_dec_256_update, 00, 00, 02d4
+slversion aes_gcm_enc_256_finalize, 00, 00, 02d5
+slversion aes_gcm_dec_256_finalize, 00, 00, 02d6
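
Each mbin_interface/mbin_dispatch_init7 pair above publishes one public aes_gcm_* symbol and binds it, based on detected CPU features, to the SSE, AVX gen2/gen4 or VAES/AVX-512 body. A hedged usage sketch of the dispatched one-shot encrypt, assuming these entry points take the same arguments as the *_nt calls in the random test later in this patch:

#include <stdint.h>
#include <aes_gcm.h>

/* Sketch only: argument order mirrors gcm_nt_rand_test.c; tag_len <= 16. */
static void gcm128_encrypt_once(uint8_t *key, uint8_t *iv,
				uint8_t *aad, uint64_t aad_len,
				uint8_t *pt, uint64_t pt_len,
				uint8_t *ct, uint8_t *tag, uint64_t tag_len)
{
	struct gcm_key_data gkey;	/* expanded AES round keys + GHASH keys */
	struct gcm_context_data gctx;	/* per-message running state            */

	aes_gcm_pre_128(key, &gkey);	/* precompute once per key              */
	aes_gcm_enc_128(&gkey, &gctx, ct, pt, pt_len,
			iv, aad, aad_len, tag, tag_len);
}
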
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm
new file mode 100644
index 000000000..4c5083173
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary_nt.asm
@@ -0,0 +1,118 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%include "reg_sizes.asm"
+
+extern aes_gcm_enc_128_sse_nt
+extern aes_gcm_enc_128_avx_gen4_nt
+extern aes_gcm_enc_128_avx_gen2_nt
+extern aes_gcm_enc_128_update_sse_nt
+extern aes_gcm_enc_128_update_avx_gen4_nt
+extern aes_gcm_enc_128_update_avx_gen2_nt
+
+extern aes_gcm_dec_128_sse_nt
+extern aes_gcm_dec_128_avx_gen4_nt
+extern aes_gcm_dec_128_avx_gen2_nt
+extern aes_gcm_dec_128_update_sse_nt
+extern aes_gcm_dec_128_update_avx_gen4_nt
+extern aes_gcm_dec_128_update_avx_gen2_nt
+
+extern aes_gcm_enc_256_sse_nt
+extern aes_gcm_enc_256_avx_gen4_nt
+extern aes_gcm_enc_256_avx_gen2_nt
+extern aes_gcm_enc_256_update_sse_nt
+extern aes_gcm_enc_256_update_avx_gen4_nt
+extern aes_gcm_enc_256_update_avx_gen2_nt
+
+extern aes_gcm_dec_256_sse_nt
+extern aes_gcm_dec_256_avx_gen4_nt
+extern aes_gcm_dec_256_avx_gen2_nt
+extern aes_gcm_dec_256_update_sse_nt
+extern aes_gcm_dec_256_update_avx_gen4_nt
+extern aes_gcm_dec_256_update_avx_gen2_nt
+
+%if (AS_FEATURE_LEVEL) >= 10
+extern aes_gcm_enc_128_update_vaes_avx512_nt
+extern aes_gcm_dec_128_update_vaes_avx512_nt
+extern aes_gcm_enc_128_vaes_avx512_nt
+extern aes_gcm_dec_128_vaes_avx512_nt
+
+extern aes_gcm_enc_256_update_vaes_avx512_nt
+extern aes_gcm_dec_256_update_vaes_avx512_nt
+extern aes_gcm_enc_256_vaes_avx512_nt
+extern aes_gcm_dec_256_vaes_avx512_nt
+%endif
+
+section .text
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate aes_gcm NT interfaces enc, enc_update, dec, dec_update
+;;;;
+mbin_interface aes_gcm_enc_128_nt
+mbin_dispatch_init7 aes_gcm_enc_128_nt, aes_gcm_enc_128_sse_nt, aes_gcm_enc_128_sse_nt, aes_gcm_enc_128_avx_gen2_nt, aes_gcm_enc_128_avx_gen4_nt, aes_gcm_enc_128_avx_gen4_nt, aes_gcm_enc_128_vaes_avx512_nt
+
+mbin_interface aes_gcm_enc_128_update_nt
+mbin_dispatch_init7 aes_gcm_enc_128_update_nt, aes_gcm_enc_128_update_sse_nt, aes_gcm_enc_128_update_sse_nt, aes_gcm_enc_128_update_avx_gen2_nt, aes_gcm_enc_128_update_avx_gen4_nt, aes_gcm_enc_128_update_avx_gen4_nt, aes_gcm_enc_128_update_vaes_avx512_nt
+
+mbin_interface aes_gcm_dec_128_nt
+mbin_dispatch_init7 aes_gcm_dec_128_nt, aes_gcm_dec_128_sse_nt, aes_gcm_dec_128_sse_nt, aes_gcm_dec_128_avx_gen2_nt, aes_gcm_dec_128_avx_gen4_nt, aes_gcm_dec_128_avx_gen4_nt, aes_gcm_dec_128_vaes_avx512_nt
+
+mbin_interface aes_gcm_dec_128_update_nt
+mbin_dispatch_init7 aes_gcm_dec_128_update_nt, aes_gcm_dec_128_update_sse_nt, aes_gcm_dec_128_update_sse_nt, aes_gcm_dec_128_update_avx_gen2_nt, aes_gcm_dec_128_update_avx_gen4_nt, aes_gcm_dec_128_update_avx_gen4_nt, aes_gcm_dec_128_update_vaes_avx512_nt
+
+;;;;
+; instantiate aes_gcm 256-bit NT interfaces enc, enc_update, dec and dec_update
+;;;;
+mbin_interface aes_gcm_enc_256_nt
+mbin_dispatch_init7 aes_gcm_enc_256_nt, aes_gcm_enc_256_sse_nt, aes_gcm_enc_256_sse_nt, aes_gcm_enc_256_avx_gen2_nt, aes_gcm_enc_256_avx_gen4_nt, aes_gcm_enc_256_avx_gen4_nt, aes_gcm_enc_256_vaes_avx512_nt
+
+mbin_interface aes_gcm_enc_256_update_nt
+mbin_dispatch_init7 aes_gcm_enc_256_update_nt, aes_gcm_enc_256_update_sse_nt, aes_gcm_enc_256_update_sse_nt, aes_gcm_enc_256_update_avx_gen2_nt, aes_gcm_enc_256_update_avx_gen4_nt, aes_gcm_enc_256_update_avx_gen4_nt, aes_gcm_enc_256_update_vaes_avx512_nt
+
+mbin_interface aes_gcm_dec_256_nt
+mbin_dispatch_init7 aes_gcm_dec_256_nt, aes_gcm_dec_256_sse_nt, aes_gcm_dec_256_sse_nt, aes_gcm_dec_256_avx_gen2_nt, aes_gcm_dec_256_avx_gen4_nt, aes_gcm_dec_256_avx_gen4_nt, aes_gcm_dec_256_vaes_avx512_nt
+
+mbin_interface aes_gcm_dec_256_update_nt
+mbin_dispatch_init7 aes_gcm_dec_256_update_nt, aes_gcm_dec_256_update_sse_nt, aes_gcm_dec_256_update_sse_nt, aes_gcm_dec_256_update_avx_gen2_nt, aes_gcm_dec_256_update_avx_gen4_nt, aes_gcm_dec_256_update_avx_gen4_nt, aes_gcm_dec_256_update_vaes_avx512_nt
+
+
+;;; func core, ver, snum
+slversion aes_gcm_enc_128_nt, 00, 00, 02e1
+slversion aes_gcm_dec_128_nt, 00, 00, 02e2
+slversion aes_gcm_enc_128_update_nt, 00, 00, 02e3
+slversion aes_gcm_dec_128_update_nt, 00, 00, 02e4
+slversion aes_gcm_enc_256_nt, 00, 00, 02e5
+slversion aes_gcm_dec_256_nt, 00, 00, 02e6
+slversion aes_gcm_enc_256_update_nt, 00, 00, 02e7
+slversion aes_gcm_dec_256_update_nt, 00, 00, 02e8
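
The *_nt variants bound above use non-temporal loads/stores on the data buffers, which is why the test that follows allocates plaintext and ciphertext with posix_memalign to 64 bytes. A minimal allocation sketch under that assumption (helper name is illustrative):

#include <stdint.h>
#include <stdlib.h>

/* 64-byte-aligned in/out buffers for the aes_gcm_*_nt entry points. */
static int alloc_nt_buffers(uint64_t len, uint8_t **pt, uint8_t **ct)
{
	if (posix_memalign((void **)pt, 64, len) != 0)
		return -1;
	if (posix_memalign((void **)ct, 64, len) != 0) {
		free(*pt);
		return -1;
	}
	return 0;
}
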
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c
new file mode 100644
index 000000000..529d36b31
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_rand_test.c
@@ -0,0 +1,2038 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include <openssl/sha.h>
+#include "gcm_vectors.h"
+#include "ossl_helper.h"
+#include "types.h"
+
+//#define GCM_VECTORS_VERBOSE
+//#define GCM_VECTORS_EXTRA_VERBOSE
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#ifndef RANDOMS
+# define RANDOMS 200
+#endif
+#ifndef TEST_LEN
+# define TEST_LEN 32*1024
+#endif
+#ifndef PAGE_LEN
+# define PAGE_LEN (4*1024)
+#endif
+
+// NT versions require 64B alignment
+# define NT_ALIGNMENT (64)
+# define ALIGNMENT_MASK (~(NT_ALIGNMENT - 1))
+# define OFFSET_BASE_VALUE (NT_ALIGNMENT)
+#ifndef MAX_UNALIGNED
+# define MAX_UNALIGNED (1)
+#endif
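/*
 * With NT_ALIGNMENT = 64, ALIGNMENT_MASK is ~0x3F, so "x & ALIGNMENT_MASK"
 * rounds x down to a multiple of 64 (for example 100 & ~0x3F == 64).  The
 * streaming tests below apply this mask to their random break points so that
 * every partial update handed to the _nt functions stays 64-byte aligned.
 */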
+
+void dump_table(char *title, uint8_t * table, uint8_t count)
+{
+ int i;
+ char const *space = " ";
+
+ printf("%s%s => {\n", space, title);
+ for (i = 0; i < count; i++) {
+ if (0 == (i & 15))
+ printf("%s%s", space, space);
+ printf("%2x, ", table[i]);
+ if (15 == (i & 15))
+ printf("\n");
+
+ }
+ printf("%s}\n", space);
+}
+
+void dump_gcm_data(struct gcm_key_data *gkey)
+{
+#ifdef GCM_VECTORS_EXTRA_VERBOSE
+ printf("gcm_data {\n");
+ dump_table("expanded_keys", gkey->expanded_keys, (16 * 11));
+ dump_table("shifted_hkey_1", gkey->shifted_hkey_1, 16);
+ dump_table("shifted_hkey_2", gkey->shifted_hkey_2, 16);
+ dump_table("shifted_hkey_3", gkey->shifted_hkey_3, 16);
+ dump_table("shifted_hkey_4", gkey->shifted_hkey_4, 16);
+ dump_table("shifted_hkey_5", gkey->shifted_hkey_5, 16);
+ dump_table("shifted_hkey_6", gkey->shifted_hkey_6, 16);
+ dump_table("shifted_hkey_7", gkey->shifted_hkey_7, 16);
+ dump_table("shifted_hkey_8", gkey->shifted_hkey_8, 16);
+ dump_table("shifted_hkey_1_k", gkey->shifted_hkey_1_k, 16);
+ dump_table("shifted_hkey_2_k", gkey->shifted_hkey_2_k, 16);
+ dump_table("shifted_hkey_3_k", gkey->shifted_hkey_3_k, 16);
+ dump_table("shifted_hkey_4_k", gkey->shifted_hkey_4_k, 16);
+ dump_table("shifted_hkey_5_k", gkey->shifted_hkey_5_k, 16);
+ dump_table("shifted_hkey_6_k", gkey->shifted_hkey_6_k, 16);
+ dump_table("shifted_hkey_7_k", gkey->shifted_hkey_7_k, 16);
+ dump_table("shifted_hkey_8_k", gkey->shifted_hkey_8_k, 16);
+ printf("}\n");
+#endif //GCM_VECTORS_EXTRA_VERBOSE
+}
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+
+ mismatch = memcmp(test, expected, len);
+ if (mismatch) {
+ OK = 1;
+ printf(" expected results don't match %s \t\t", data_name);
+ {
+ uint64_t a;
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ }
+ return OK;
+}
+
+int check_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, gcm_vector * vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ int ret;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ ret = posix_memalign((void **)&pt_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&ct_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen);
+ if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL)
+ || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_128_nt(gkey, gctx, vector->C, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_128_nt(gkey, gctx, vector->P, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ if (vector->Plen != 0) {
+ aligned_free(pt_test);
+ aligned_free(ct_test);
+ aligned_free(o_ct_test);
+ }
+
+ return OK;
+}
+
+int check_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector, int test_len)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break;
+ int i, ret;
+ uint8_t *rand_data = NULL;
+ uint64_t length;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ ret = posix_memalign((void **)&pt_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&ct_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen);
+ if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL)
+ || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+
+ last_break = 0;
+ i = (rand() % test_len / 8) & ALIGNMENT_MASK;
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ ret = posix_memalign((void **)&stream, 64, (i - last_break));
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ memcpy(stream, vector->P + last_break, i - last_break);
+ }
+ aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ aligned_free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+ last_break = i;
+ i = (rand() % test_len / 8) & ALIGNMENT_MASK;
+
+ }
+ aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ if (gctx->in_length != vector->Plen)
+ printf("%lu, %lu\n", gctx->in_length, vector->Plen);
+ aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = 0;
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (rand() % (test_len / 64) == 0) {
+ if (i - last_break != 0) {
+ ret = posix_memalign((void **)&stream, 64, i - last_break);
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ memcpy(stream, vector->C + last_break, i - last_break);
+ }
+ aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ aligned_free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+
+ last_break = i;
+
+ }
+ if (rand() % 1024 != 0)
+ i++;
+
+ }
+ aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ if (vector->Plen != 0) {
+ aligned_free(pt_test);
+ aligned_free(ct_test);
+ aligned_free(o_ct_test);
+ }
+ free(rand_data);
+
+ return OK;
+}
+
+int check_strm_vector2(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector, int length, int start, int breaks)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break = 0;
+ int i = length;
+ uint8_t *rand_data = NULL;
+ int ret;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ ret = posix_memalign((void **)&o_ct_test, 64, vector->Plen);
+ if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL)
+ || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_128_nt(gkey, gctx, vector->C, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ ret = posix_memalign((void **)&stream, 64, i - last_break);
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ memcpy(stream, vector->P + last_break, i - last_break);
+ }
+ aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ aligned_free(stream);
+ last_break = i;
+ i = i + (length - start) / breaks;
+
+ }
+ aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = length;
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ ret = posix_memalign((void **)&stream, 64, i - last_break);
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ memcpy(stream, vector->C + last_break, i - last_break);
+ }
+ aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ aligned_free(stream);
+ last_break = i;
+ i = i + (length - start) / breaks;
+
+ }
+
+ aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(rand_data);
+ if (vector->Plen != 0) {
+ free(pt_test);
+ free(ct_test);
+ aligned_free(o_ct_test);
+ }
+
+ return OK;
+}
+
+int check_strm_vector_efence(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break = 0;
+ int i = 1;
+ uint8_t *rand_data = NULL;
+ uint64_t length;
+ int ret;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ ret = posix_memalign((void **)&pt_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&ct_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen);
+ if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL)
+ || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < vector->Plen) {
+ if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) {
+ ret = posix_memalign((void **)&stream, 64, PAGE_LEN);
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ i = i & ALIGNMENT_MASK;
+ memcpy(stream + PAGE_LEN - (i - last_break), vector->P + last_break,
+ i - last_break);
+ aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break,
+ stream + PAGE_LEN - (i - last_break),
+ i - last_break);
+ aligned_free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+ last_break = i;
+ }
+ if (rand() % 1024 != 0)
+ i++;
+
+ }
+ aes_gcm_enc_128_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = 0;
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < vector->Plen) {
+ if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) {
+ ret = posix_memalign((void **)&stream, 64, PAGE_LEN);
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ i = i & ALIGNMENT_MASK;
+ memcpy(stream + PAGE_LEN - (i - last_break), vector->C + last_break,
+ i - last_break);
+ aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break,
+ stream + PAGE_LEN - (i - last_break),
+ i - last_break);
+ aligned_free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+
+ last_break = i;
+
+ }
+ if (rand() % 1024 != 0)
+ i++;
+
+ }
+ aes_gcm_dec_128_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ if (vector->Plen != 0) {
+ aligned_free(pt_test);
+ aligned_free(ct_test);
+ aligned_free(o_ct_test);
+ }
+ free(rand_data);
+
+ return OK;
+}
+
+int check_256_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ int ret;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ ret = posix_memalign((void **)&pt_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&ct_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen);
+ if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL)
+ || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_256_nt(gkey, gctx, vector->C, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_256_nt(gkey, gctx, vector->P, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted ISA-L plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_256_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted OpenSSL plain text (P)");
+ result =
+ openssl_aes_256_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ if (vector->Plen != 0) {
+ aligned_free(pt_test);
+ aligned_free(ct_test);
+ aligned_free(o_ct_test);
+ }
+
+ return OK;
+}
+
+int check_256_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector, int test_len)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break;
+ int i, ret;
+ uint8_t *rand_data = NULL;
+ uint64_t length;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ ret = posix_memalign((void **)&pt_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&ct_test, 64, vector->Plen);
+ ret |= posix_memalign((void **)&o_ct_test, 64, vector->Plen);
+ if ((ret != 0) || (pt_test == NULL) || (ct_test == NULL)
+ || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for a copy of the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen);
+
+ last_break = 0;
+ i = (rand() % test_len / 8) & ALIGNMENT_MASK;
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ ret = posix_memalign((void **)&stream, 64, i - last_break);
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ memcpy(stream, vector->P + last_break, i - last_break);
+ }
+
+ aes_gcm_enc_256_update_nt(gkey, gctx, vector->C + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ aligned_free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+ last_break = i;
+ i += (rand() % test_len / 8) & ALIGNMENT_MASK;
+
+ }
+ aes_gcm_enc_256_update_nt(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ if (gctx->in_length != vector->Plen)
+ printf("%lu, %lu\n", gctx->in_length, vector->Plen);
+ aes_gcm_enc_256_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = (rand() % test_len / 8) & ALIGNMENT_MASK;
+ aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ ret = posix_memalign((void **)&stream, 64, i - last_break);
+ if ((ret != 0) || (stream == NULL)) {
+ OK = 1;
+ fprintf(stderr, "posix_memalign failed\n");
+ break;
+ }
+ memcpy(stream, vector->C + last_break, i - last_break);
+ }
+
+ aes_gcm_dec_256_update_nt(gkey, gctx, vector->P + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ aligned_free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+
+ last_break = i;
+ i += (rand() % test_len / 8) & ALIGNMENT_MASK;
+
+ }
+ aes_gcm_dec_256_update_nt(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_256_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted ISA-L plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_256_nt(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted OpenSSL plain text (P)");
+ result =
+ openssl_aes_256_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(rand_data);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ if (vector->Plen != 0) {
+ aligned_free(pt_test);
+ aligned_free(ct_test);
+ aligned_free(o_ct_test);
+ }
+
+ return OK;
+}
+
+int test_gcm_strm_efence(void)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ int ret;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES GCM random efence test vectors with random stream:");
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % TEST_LEN);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ ret = posix_memalign((void **)&test.P, 64, test.Plen + offset);
+ ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ ret = posix_memalign((void **)&test.P, 64, 16);
+ ret |= posix_memalign((void **)&test.C, 64, 16);
+ }
+ if (ret != 0) {
+ printf("posix_memalign for testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_strm_vector_efence(gkey, gctx, &test))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ aligned_free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ aligned_free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
+
+int test_gcm_strm_combinations(int test_len)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ uint8_t *gkeytemp = NULL;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ int ret;
+
+ gkeytemp = malloc(sizeof(struct gcm_key_data) + 64);
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkeytemp || NULL == gctx)
+ return 1;
+ gkey = (struct gcm_key_data *)(gkeytemp + rand() % 64);
+
+ printf("AES GCM random test vectors with random stream of average size %d:",
+ test_len / 64);
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % test_len);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % test_len);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ ret = posix_memalign((void **)&test.P, 64, test.Plen + offset);
+ ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ ret = posix_memalign((void **)&test.P, 64, 16);
+ ret |= posix_memalign((void **)&test.C, 64, 16);
+ }
+ if (ret != 0) {
+ printf("posix_memalign for testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_strm_vector(gkey, gctx, &test, test_len))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ aligned_free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ aligned_free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkeytemp);
+ free(gctx);
+ return 0;
+}
+
+int test_gcm_combinations(void)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ int ret;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES GCM random test vectors:");
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % TEST_LEN);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ ret = posix_memalign((void **)&test.P, 64, test.Plen + offset);
+ ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ ret = posix_memalign((void **)&test.P, 64, 16);
+ ret |= posix_memalign((void **)&test.C, 64, 16);
+ }
+ if (ret != 0) {
+ printf("posix_memalign for testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_vector(gkey, gctx, &test))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ aligned_free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ aligned_free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
+
+int test_gcm256_combinations(void)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ int ret;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES-GCM-256 random test vectors:");
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % TEST_LEN);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ ret = posix_memalign((void **)&test.P, 64, test.Plen + offset);
+ ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ ret = posix_memalign((void **)&test.P, 64, 16);
+ ret |= posix_memalign((void **)&test.C, 64, 16);
+ }
+ if (ret != 0) {
+ printf("posix_memalign for testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+ test.K = malloc(GCM_256_KEY_LEN + offset);
+ test.Klen = GCM_256_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_256_vector(gkey, gctx, &test))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ aligned_free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ aligned_free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
+
+int test_gcm256_strm_combinations(int test_len)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ uint8_t *gkeytemp = NULL;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ int ret;
+
+ gkeytemp = malloc(sizeof(struct gcm_key_data) + 64);
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkeytemp || NULL == gctx)
+ return 1;
+ gkey = (struct gcm_key_data *)(gkeytemp + rand() % 64);
+
+ printf("AES-GCM-256 random test vectors with random stream of average size %d:",
+ test_len / 64);
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % test_len);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % test_len);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ ret = posix_memalign((void **)&test.P, 64, test.Plen + offset);
+ ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ ret = posix_memalign((void **)&test.P, 64, 16);
+ ret |= posix_memalign((void **)&test.C, 64, 16);
+ }
+ if (ret != 0) {
+ printf("posix_memalign for testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+ test.K = malloc(GCM_256_KEY_LEN + offset);
+ test.Klen = GCM_256_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_256_strm_vector(gkey, gctx, &test, test_len))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ aligned_free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ aligned_free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkeytemp);
+ free(gctx);
+ return 0;
+}
+
+//
+// Place all data so that it ends exactly at a page boundary, to catch reads
+// past the end of each buffer.
+//
+int test_gcm_efence(void)
+{
+ gcm_vector test;
+ int offset = 0;
+ gcm_key_size key_len;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ uint8_t *P = NULL, *C = NULL, *K, *IV, *A, *T;
+ int ret;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ ret = posix_memalign((void **)&P, 64, PAGE_LEN);
+ ret |= posix_memalign((void **)&C, 64, PAGE_LEN);
+ K = malloc(PAGE_LEN);
+ IV = malloc(PAGE_LEN);
+ A = malloc(PAGE_LEN);
+ T = malloc(PAGE_LEN);
+ if ((0 != ret) || (NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV)
+ || (NULL == A) || (NULL == T) || (NULL == gkey) || (NULL == gctx)) {
+ printf("malloc of testsize:0x%x failed\n", PAGE_LEN);
+ return -1;
+ }
+
+ test.Plen = PAGE_LEN / 2;
+ // place buffers to end at page boundary
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.Alen = test.Plen;
+ test.Tlen = MAX_TAG_LEN;
+
+ printf("AES GCM efence test vectors:");
+ for (key_len = GCM_128_KEY_LEN; GCM_256_KEY_LEN >= key_len;
+ key_len += (GCM_256_KEY_LEN - GCM_128_KEY_LEN)) {
+ test.Klen = key_len;
+ for (offset = 0; MAX_UNALIGNED > offset; offset++) {
+ if (0 == (offset % 80))
+ printf("\n");
+ // move the start and size of the data block towards the end of the page
+ test.Plen = (PAGE_LEN / 2) - offset;
+ test.Alen = (PAGE_LEN / 4) - (offset * 4); //lengths must be a multiple of 4 bytes
+ //Place data at end of page
+ test.P = P + PAGE_LEN - test.Plen;
+ test.C = C + PAGE_LEN - test.Plen;
+ test.K = K + PAGE_LEN - test.Klen;
+ test.IV = IV + PAGE_LEN - test.IVlen;
+ test.A = A + PAGE_LEN - test.Alen;
+ test.T = T + PAGE_LEN - test.Tlen;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+ if (GCM_128_KEY_LEN == key_len) {
+ if (0 != check_vector(gkey, gctx, &test))
+ return 1;
+ } else {
+ if (0 != check_256_vector(gkey, gctx, &test))
+ return 1;
+ }
+ }
+ }
+ free(gkey);
+ free(gctx);
+ aligned_free(P);
+ aligned_free(C);
+ free(K);
+ free(IV);
+ free(A);
+ free(T);
+
+ printf("\n");
+ return 0;
+}
+
+int test_gcm128_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int ret;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("AES-GCM-128:\n");
+#endif
+
+ // Allocate space for the calculated ciphertext
+ ret = posix_memalign((void **)&ct_test, 64, vector->Plen);
+ // Allocate space for the calculated plaintext
+ ret |= posix_memalign((void **)&pt_test, 64, vector->Plen);
+ if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, &gkey);
+#ifdef GCM_VECTORS_VERBOSE
+ dump_gcm_data(&gkey);
+#endif
+
+ ////
+ // ISA-l Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, pt_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L tag (T)");
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_128_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+ // OpenSSl enc -> ISA-L dec
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |=
+ check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)");
+
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "OpenSSL->ISA-L decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)");
+ // ISA-L enc -> OpenSSl dec
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ ct_test, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (P)");
+ if (NULL != ct_test)
+ aligned_free(ct_test);
+ if (NULL != pt_test)
+ aligned_free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
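+
+/*
+ * Illustrative sketch (not part of the original test suite): typical
+ * application usage of the one-shot API exercised above.  The key schedule
+ * from aes_gcm_pre_128() is computed once and reused for every message under
+ * the same key, and the tag produced by decryption must be compared against
+ * the tag that accompanied the ciphertext before the plaintext is trusted.
+ * The function name and buffer parameters below are hypothetical.
+ */
+int gcm128_seal_open_sketch(const uint8_t * key, uint8_t * iv,
+ uint8_t * aad, uint64_t aad_len,
+ uint8_t * msg, uint8_t * ct, uint8_t * pt, uint64_t len)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ uint8_t tag_enc[MAX_TAG_LEN], tag_dec[MAX_TAG_LEN];
+
+ // One-time key expansion and GHASH table precomputation for this key
+ aes_gcm_pre_128(key, &gkey);
+
+ // Encrypt and produce the authentication tag
+ aes_gcm_enc_128(&gkey, &gctx, ct, msg, len, iv, aad, aad_len,
+ tag_enc, MAX_TAG_LEN);
+
+ // Decrypt; the tag is an output here and still has to be checked
+ aes_gcm_dec_128(&gkey, &gctx, pt, ct, len, iv, aad, aad_len,
+ tag_dec, MAX_TAG_LEN);
+
+ // Reject the plaintext unless the tags match (production code would use a
+ // constant-time comparison)
+ return memcmp(tag_enc, tag_dec, MAX_TAG_LEN) ? -1 : 0;
+}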
+
+int test_gcm256_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int ret;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("AES-GCM-256:\n");
+#endif
+
+ // Allocate space for the calculated ciphertext
+ ret = posix_memalign((void **)&ct_test, 64, vector->Plen);
+ // Allocate space for the calculated plaintext
+ ret |= posix_memalign((void **)&pt_test, 64, vector->Plen);
+ if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, &gkey);
+#ifdef GCM_VECTORS_VERBOSE
+ dump_gcm_data(&gkey);
+#endif
+
+ ////
+ // ISA-l Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, pt_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "OpenSSL vs KA - cypher text (C)");
+ OK |= check_data(pt_test, vector->T, vector->Tlen, "OpenSSL vs KA - tag (T)");
+ OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L - tag (T)");
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_256_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+ // OpenSSl enc -> ISA-L dec
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |=
+ check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)");
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "OpenSSL->ISA-L decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)");
+ // ISA-L enc -> OpenSSl dec
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ result =
+ openssl_aes_256_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ ct_test, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (P)");
+ if (NULL != ct_test)
+ aligned_free(ct_test);
+ if (NULL != pt_test)
+ aligned_free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm_std_vectors(void)
+{
+ int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]);
+ int vect;
+ int OK = 0;
+
+ printf("AES-GCM standard test vectors:\n");
+ for (vect = 0; vect < vectors_cnt; vect++) {
+#ifdef GCM_VECTORS_VERBOSE
+ printf
+ ("Standard vector %d/%d Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen,
+ (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen,
+ (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen);
+#else
+ printf(".");
+#endif
+
+ if (BITS_128 == gcm_vectors[vect].Klen) {
+ OK |= test_gcm128_std_vectors(&gcm_vectors[vect]);
+ } else {
+ OK |= test_gcm256_std_vectors(&gcm_vectors[vect]);
+ }
+ if (0 != OK)
+ return OK;
+ }
+ printf("\n");
+ return OK;
+}
+
+// The data length is set to length. The first stream update covers bytes 0 through start;
+// the remaining data is then split into breaks chunks of equal size (except possibly the
+// last one, due to divisibility). A small illustration of this chunking follows the
+// function below.
+int test_gcm_strm_combinations2(int length, int start, int breaks)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ int ret;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES GCM random test vectors of length %d and stream with %d breaks:", length,
+ breaks + 1);
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = length;
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ ret = posix_memalign((void **)&test.P, 64, test.Plen + offset);
+ ret |= posix_memalign((void **)&test.C, 64, test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ ret = posix_memalign((void **)&test.P, 64, 16);
+ ret |= posix_memalign((void **)&test.C, 64, 16);
+ }
+ if (ret != 0) {
+ printf("posix_memalign for testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_strm_vector2(gkey, gctx, &test, length, start, breaks))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ aligned_free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ aligned_free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
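+
+/*
+ * Illustrative sketch (not part of the original test): the chunking scheme
+ * described in the comment above test_gcm_strm_combinations2().  The helper
+ * name below is hypothetical; it only prints the update boundaries that a
+ * streaming check such as check_strm_vector2() (defined elsewhere in this
+ * file) would be expected to use: one update of start bytes, then breaks
+ * roughly equal updates covering the remainder.
+ */
+void show_strm_breaks(int length, int start, int breaks)
+{
+ int pos = start;
+ int chunk = (breaks > 0) ? (length - start) / breaks : 0;
+ int i;
+
+ printf("update: [0, %d)\n", start);
+ for (i = 0; i < breaks; i++) {
+ // the last chunk absorbs any remainder left over by the division
+ int end = (i == breaks - 1) ? length : pos + chunk;
+ printf("update: [%d, %d)\n", pos, end);
+ pos = end;
+ }
+}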
+
+int main(int argc, char **argv)
+{
+ int errors = 0;
+ int seed;
+
+ if (argc == 1)
+ seed = TEST_SEED;
+ else
+ seed = atoi(argv[1]);
+
+ srand(seed);
+ printf("SEED: %d\n", seed);
+
+ errors += test_gcm_std_vectors();
+ errors += test_gcm256_combinations();
+ errors += test_gcm_combinations();
+ errors += test_gcm_efence();
+ errors += test_gcm256_strm_combinations(TEST_LEN);
+ errors += test_gcm_strm_combinations(TEST_LEN);
+ errors += test_gcm256_strm_combinations(1024);
+ errors += test_gcm_strm_combinations(1024);
+ errors += test_gcm_strm_efence();
+ errors += test_gcm_strm_combinations2(1024, 0, 1024);
+
+ if (0 == errors)
+ printf("...Pass\n");
+ else
+ printf("...Fail\n");
+
+ return errors;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_std_vectors_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_std_vectors_test.c
new file mode 100644
index 000000000..19c0cc447
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_nt_std_vectors_test.c
@@ -0,0 +1,322 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include "gcm_vectors.h"
+#include "types.h"
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+
+ mismatch = memcmp(test, expected, len);
+ if (mismatch) {
+ OK = 1;
+ printf(" expected results don't match %s \t\t", data_name);
+ {
+ uint64_t a;
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ }
+ return OK;
+}
+
+int test_gcm128_std_vectors_nt(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int ret;
+
+ // Allocate space for the calculated ciphertext
+ ret = posix_memalign((void **)&ct_test, 32, vector->Plen);
+ // Allocate space for the calculated plaintext
+ ret |= posix_memalign((void **)&pt_test, 32, vector->Plen);
+ if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for a copy of the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, &gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_128_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_128_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+ memset(pt_test, 0, vector->Plen);
+
+ if (NULL != ct_test)
+ aligned_free(ct_test);
+ if (NULL != pt_test)
+ aligned_free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm256_std_vectors_nt(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int ret;
+
+ // Allocate space for the calculated ciphertext
+ ret = posix_memalign((void **)&ct_test, 32, vector->Plen);
+ // Allocate space for the calculated plaintext
+ ret |= posix_memalign((void **)&pt_test, 32, vector->Plen);
+ if ((ret != 0) || (ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for a copy of the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, &gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256_nt(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ memset(pt_test, 0, vector->Plen);
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_256_nt(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256_nt(&gkey, &gctx, ct_test, pt_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_256_nt(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+ if (NULL != ct_test)
+ aligned_free(ct_test);
+ if (NULL != pt_test)
+ aligned_free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm_std_vectors_nt(void)
+{
+ int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]);
+ int vect;
+ int OK = 0;
+
+ printf("AES-GCM standard test vectors NT:\n");
+ for (vect = 0; (vect < vectors_cnt); vect++) {
+#ifdef DEBUG
+ printf("Standard vector NT %d/%d"
+ " Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen,
+ (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen,
+ (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen);
+#else
+ printf(".");
+#endif
+ if (BITS_128 == gcm_vectors[vect].Klen)
+ OK |= test_gcm128_std_vectors_nt(&gcm_vectors[vect]);
+ else
+ OK |= test_gcm256_std_vectors_nt(&gcm_vectors[vect]);
+ if (0 != OK)
+ return OK;
+ }
+ printf("\n");
+ return OK;
+}
+
+int main(int argc, char **argv)
+{
+ int errors = 0;
+ int seed;
+
+ if (argc == 1)
+ seed = TEST_SEED;
+ else
+ seed = atoi(argv[1]);
+
+ srand(seed);
+ printf("SEED: %d\n", seed);
+
+ errors += test_gcm_std_vectors_nt();
+
+ if (0 == errors)
+ printf("...Pass\n");
+ else
+ printf("...Fail\n");
+
+ return errors;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c
new file mode 100644
index 000000000..a9e9c5914
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c
@@ -0,0 +1,272 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include <test.h>
+#include "ossl_helper.h"
+#include "gcm_vectors.h"
+
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
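+
+/*
+ * Illustrative note: the two configurations above process a comparable total
+ * volume of data -- roughly 8 KiB * 400000 loops ~= 3.3 GB warm versus
+ * 64 MiB * 50 loops ~= 3.4 GB cold -- so the timed work is similar and only
+ * the cache behaviour differs.
+ */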
+
+#define AAD_LENGTH 16
+#define TEST_MEM TEST_LEN
+
+static unsigned char *plaintext, *gcm_plaintext, *cyphertext, *ossl_plaintext,
+ *ossl_cyphertext, *gcm_tag, *ossl_tag, *IV, *AAD;
+static uint8_t key128[GCM_128_KEY_LEN];
+static uint8_t key256[GCM_256_KEY_LEN];
+uint8_t iv_len = 0;
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ unsigned int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, int vect, char *data_name)
+{
+ int mismatch;
+ int OK = 1;
+
+ mismatch = memcmp(test, expected, len);
+ if (mismatch) {
+ OK = 0;
+ printf(" v[%d] expected results don't match %s \t\t", vect, data_name);
+ {
+ uint64_t a;
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ }
+ return OK;
+}
+
+void aes_gcm_perf(void)
+{
+ struct gcm_key_data gkey, gkey256;
+ struct gcm_context_data gctx;
+ int i;
+
+ printf
+ ("AES GCM performance parameters plain text length:%d; IV length:%d; AAD length:%d\n",
+ TEST_LEN, GCM_IV_LEN, AAD_LENGTH);
+
+ mk_rand_data(key128, sizeof(key128));
+ mk_rand_data(key256, sizeof(key256));
+
+ // This is only required once for a given key
+ aes_gcm_pre_128(key128, &gkey);
+ aes_gcm_pre_256(key256, &gkey256);
+
+ // Preload code cache
+ aes_gcm_enc_128(&gkey, &gctx, cyphertext, plaintext, TEST_LEN, IV, AAD, AAD_LENGTH,
+ gcm_tag, MAX_TAG_LEN);
+ openssl_aes_gcm_enc(key128, IV, iv_len, AAD, AAD_LENGTH, ossl_tag, MAX_TAG_LEN,
+ plaintext, TEST_LEN, ossl_cyphertext);
+ check_data(cyphertext, ossl_cyphertext, TEST_LEN, 0,
+ "ISA-L vs OpenSSL 128 key cypher text (C)");
+ check_data(gcm_tag, ossl_tag, MAX_TAG_LEN, 0, "ISA-L vs OpenSSL 128 tag (T)");
+ aes_gcm_enc_256(&gkey256, &gctx, cyphertext, plaintext, TEST_LEN, IV, AAD, AAD_LENGTH,
+ gcm_tag, MAX_TAG_LEN);
+ openssl_aes_256_gcm_enc(key256, IV, iv_len, AAD, AAD_LENGTH, ossl_tag, MAX_TAG_LEN,
+ plaintext, TEST_LEN, ossl_cyphertext);
+ check_data(cyphertext, ossl_cyphertext, TEST_LEN, 0,
+ "ISA-L vs OpenSSL 256 cypher text (C)");
+ check_data(gcm_tag, ossl_tag, MAX_TAG_LEN, 0, "ISA-L vs OpenSSL 256 tag (T)");
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_gcm_enc_128(&gkey, &gctx, cyphertext, plaintext, TEST_LEN, IV, AAD,
+ AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+ }
+
+ perf_stop(&stop);
+ printf(" aes_gcm_enc" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_gcm_enc(key128, IV, iv_len, AAD, AAD_LENGTH,
+ ossl_tag, MAX_TAG_LEN, plaintext, TEST_LEN,
+ cyphertext);
+ }
+
+ perf_stop(&stop);
+ printf("openssl_aes_gcm_enc" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_gcm_dec_128(&gkey, &gctx, plaintext, cyphertext, TEST_LEN, IV,
+ AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+ check_data(gcm_tag, gcm_tag, MAX_TAG_LEN, 0, "ISA-L check of tag (T)");
+ }
+
+ perf_stop(&stop);
+ printf(" aes_gcm_dec" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_gcm_dec(key128, IV, iv_len, AAD, AAD_LENGTH,
+ ossl_tag, MAX_TAG_LEN, cyphertext, TEST_LEN,
+ plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("openssl_aes_gcm_dec" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ printf("\n");
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_gcm_enc_256(&gkey256, &gctx, cyphertext, plaintext, TEST_LEN, IV,
+ AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+ }
+
+ perf_stop(&stop);
+ printf(" aes_gcm256_enc" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_256_gcm_enc(key256, IV, iv_len, AAD, AAD_LENGTH,
+ ossl_tag, MAX_TAG_LEN, plaintext, TEST_LEN,
+ cyphertext);
+ }
+
+ perf_stop(&stop);
+ printf("openssl_aes_256_gcm_enc" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_gcm_dec_256(&gkey256, &gctx, plaintext, cyphertext, TEST_LEN, IV,
+ AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+ check_data(gcm_tag, gcm_tag, MAX_TAG_LEN, 0,
+ "ISA-L check of 256 tag (T)");
+ }
+
+ perf_stop(&stop);
+ printf(" aes_gcm256_dec" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_256_gcm_dec(key256, IV, iv_len, AAD, AAD_LENGTH,
+ ossl_tag, MAX_TAG_LEN, cyphertext, TEST_LEN,
+ plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("openssl_aes_256_gcm_dec" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+}
+
+int main(void)
+{
+ uint8_t const IVend[] = GCM_IV_END_MARK;
+ uint32_t OK = 1;
+
+ plaintext = malloc(TEST_LEN);
+ gcm_plaintext = malloc(TEST_LEN);
+ cyphertext = malloc(TEST_LEN);
+ ossl_plaintext = malloc(TEST_LEN + 16);
+ ossl_cyphertext = malloc(TEST_LEN);
+ gcm_tag = malloc(MAX_TAG_LEN);
+ ossl_tag = malloc(MAX_TAG_LEN);
+ AAD = malloc(AAD_LENGTH);
+ IV = malloc(GCM_IV_LEN);
+ if ((NULL == plaintext) || (NULL == cyphertext) || (NULL == gcm_plaintext)
+ || (NULL == ossl_plaintext) || (NULL == ossl_cyphertext)
+ || (NULL == gcm_tag) || (NULL == ossl_tag) || (NULL == AAD) || (NULL == IV)) {
+ printf("malloc of testsize:0x%x failed\n", TEST_LEN);
+ return -1;
+ }
+
+ mk_rand_data(plaintext, TEST_LEN);
+ mk_rand_data(AAD, AAD_LENGTH);
+ mk_rand_data(IV, GCM_IV_LEN);
+ memcpy(&IV[GCM_IV_END_START], IVend, sizeof(IVend));
+ iv_len = GCM_IV_LEN - sizeof(IVend); //end marker not part of IV length
+
+ printf("AES GCM ISA-L vs OpenSSL performance\n");
+ aes_gcm_perf();
+
+ return !OK;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c
new file mode 100644
index 000000000..ee064ef6c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c
@@ -0,0 +1,61 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <aes_gcm.h>
+#include <aes_keyexp.h>
+
+void aes_keyexp_128_enc(const void *, uint8_t *);
+void aes_gcm_precomp_128(struct gcm_key_data *key_data);
+void aes_gcm_precomp_256(struct gcm_key_data *key_data);
+
+void aes_gcm_pre_128(const void *key, struct gcm_key_data *key_data)
+{
+ aes_keyexp_128_enc(key, key_data->expanded_keys);
+ aes_gcm_precomp_128(key_data);
+}
+
+void aes_gcm_pre_256(const void *key, struct gcm_key_data *key_data)
+{
+ uint8_t tmp_exp_key[GCM_ENC_KEY_LEN * GCM_KEY_SETS];
+ aes_keyexp_256((const uint8_t *)key, (uint8_t *) key_data->expanded_keys, tmp_exp_key);
+ aes_gcm_precomp_256(key_data);
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver aes_gcm_pre_128_slver_000002c7;
+struct slver aes_gcm_pre_128_slver = { 0x02c7, 0x00, 0x00 };
+
+struct slver aes_gcm_pre_256_slver_000002d7;
+struct slver aes_gcm_pre_256_slver = { 0x02d7, 0x00, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_simple_example.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_simple_example.c
new file mode 100644
index 000000000..4b7ca9736
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_simple_example.c
@@ -0,0 +1,78 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include "aes_gcm.h"
+
+#define TXT_SIZE 8
+#define AAD_SIZE 32
+#define TAG_SIZE 16 /* Valid values are 16, 12, or 8 */
+#define KEY_SIZE GCM_256_KEY_LEN
+#define IV_SIZE GCM_IV_DATA_LEN
+
+void mprint(const char *msg, uint8_t * buf, int len)
+{
+ int i;
+ printf("%s", msg);
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 32 == 0)
+ printf("\n");
+ }
+ printf("\n");
+}
+
+int main(void)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ uint8_t ct[TXT_SIZE], pt[TXT_SIZE], pt2[TXT_SIZE]; // Cipher text and plain text
+	uint8_t iv[IV_SIZE], aad[AAD_SIZE], key[KEY_SIZE];	// IV, additional authentication data, and key
+ uint8_t tag1[TAG_SIZE], tag2[TAG_SIZE]; // Authentication tags for encode and decode
+
+ printf("gcm example:\n");
+ memset(key, 0, KEY_SIZE);
+ memset(pt, 0, TXT_SIZE);
+ memset(iv, 0, IV_SIZE);
+ memset(aad, 0, AAD_SIZE);
+
+ aes_gcm_pre_256(key, &gkey);
+ aes_gcm_enc_256(&gkey, &gctx, ct, pt, TXT_SIZE, iv, aad, AAD_SIZE, tag1, TAG_SIZE);
+ aes_gcm_dec_256(&gkey, &gctx, pt2, ct, TXT_SIZE, iv, aad, AAD_SIZE, tag2, TAG_SIZE);
+
+ mprint(" input text: ", pt, TXT_SIZE);
+ mprint(" cipher text: ", ct, TXT_SIZE);
+ mprint(" decode text: ", pt2, TXT_SIZE);
+	mprint(" auth tag1 (enc): ", tag1, TAG_SIZE);
+	mprint(" auth tag2 (dec): ", tag2, TAG_SIZE);
+
+ return memcmp(tag1, tag2, TAG_SIZE);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm
new file mode 100644
index 000000000..e35860496
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_sse.asm
@@ -0,0 +1,2171 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16-byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+;	AAD[2] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+;       The spec defines aadLen as a multiple of 4 bytes; this code
+;       additionally supports arbitrary aadLen values.
+;
+; TLen:
+;       From the definition of the spec, TLen can only be 8, 12, or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; Throughout the code, one-tab and two-tab indentation is used: one tab for the GHASH part, two tabs for the AES part.
+;
+
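As a reading aid for the IV diagram above: in the 12-byte IV case, the counter block Y0 is simply the IV followed by a big-endian 32-bit value of 1. A minimal, hedged C sketch of that layout (the helper name is ours, not part of ISA-L):

#include <stdint.h>
#include <string.h>

/* Hedged sketch: build the 16-byte counter block (Y0/J0) for a 12-byte IV,
 * i.e. salt + per-packet IV followed by 0x00000001, as in the diagram above. */
static void build_j0_sketch(uint8_t j0[16], const uint8_t iv[12])
{
	memcpy(j0, iv, 12);     /* Salt || Initialization Vector */
	j0[12] = 0x00;
	j0[13] = 0x00;
	j0[14] = 0x00;
	j0[15] = 0x01;          /* trailing 0x1 from the diagram */
}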
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_sse.asm!"
+%endif
+%endif
+%endif
+
+%ifndef FUNCT_EXTENSION
+%define FUNCT_EXTENSION
+%endif
+
+%ifdef GCM128_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ sse %+ FUNCT_EXTENSION
+%define NROUNDS 9
+%endif
+
+%ifdef GCM192_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ sse %+ FUNCT_EXTENSION
+%define NROUNDS 11
+%endif
+
+%ifdef GCM256_MODE
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ sse %+ FUNCT_EXTENSION
+%define NROUNDS 13
+%endif
+
+
+default rel
+; FUNC_SAVE pushes 5 registers onto the stack; STACK_OFFSET must match (8 bytes per push)
+%define STACK_OFFSET 8*5
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Karatsuba Method
+ movdqa %%T1, %%GH
+ pshufd %%T2, %%GH, 01001110b
+ pshufd %%T3, %%HK, 01001110b
+ pxor %%T2, %%GH ; %%T2 = (a1+a0)
+ pxor %%T3, %%HK ; %%T3 = (b1+b0)
+
+ pclmulqdq %%T1, %%HK, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
+ pclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T2, %%GH
+ pxor %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
+
+ movdqa %%T3, %%T2
+ pslldq %%T3, 8 ; shift-L %%T3 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%GH, %%T3
+ pxor %%T1, %%T2 ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%GH
+ movdqa %%T3, %%GH
+ movdqa %%T4, %%GH ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+        pslld   %%T2, 31                                ; packed left shift << 31
+        pslld   %%T3, 30                                ; packed left shift << 30
+        pslld   %%T4, 25                                ; packed left shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T4
+
+ movdqa %%T5, %%T2
+ psrldq %%T5, 4 ; shift-R %%T5 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+        movdqa  %%T2,%%GH                               ; make 3 copies of %%GH (in %%T2, %%T3, %%T4) for doing three shift operations
+ movdqa %%T3,%%GH
+ movdqa %%T4,%%GH
+
+        psrld   %%T2,1                                  ; packed right shift >> 1
+        psrld   %%T3,2                                  ; packed right shift >> 2
+        psrld   %%T4,7                                  ; packed right shift >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T4
+
+ pxor %%T2, %%T5
+ pxor %%GH, %%T2
+        pxor    %%GH, %%T1                              ; the result is in %%GH
+
+
+%endmacro
+
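For readers tracing the macro above, here is a hedged C-with-intrinsics mirror of GHASH_MUL (the function name is ours; it assumes SSE2 plus PCLMULQDQ support). It follows the same Karatsuba split and two-phase shift reduction, so the temporaries line up with the asm comments:

#include <emmintrin.h>  /* SSE2 */
#include <wmmintrin.h>  /* PCLMULQDQ */

/* Sketch only: gh and hk are bit-reflected 128-bit values; hk is
 * HashKey<<1 mod poly, as the macro's header comment describes. */
static __m128i ghash_mul_sketch(__m128i gh, __m128i hk)
{
	__m128i t1, t2, t3, t4, t5;

	/* Karatsuba: split each operand into 64-bit halves */
	t1 = gh;
	t2 = _mm_shuffle_epi32(gh, 0x4e);         /* swap qwords */
	t3 = _mm_shuffle_epi32(hk, 0x4e);
	t2 = _mm_xor_si128(t2, gh);               /* a1 ^ a0 */
	t3 = _mm_xor_si128(t3, hk);               /* b1 ^ b0 */

	t1 = _mm_clmulepi64_si128(t1, hk, 0x11);  /* a1*b1 */
	gh = _mm_clmulepi64_si128(gh, hk, 0x00);  /* a0*b0 */
	t2 = _mm_clmulepi64_si128(t2, t3, 0x00);  /* (a1^a0)*(b1^b0) */
	t2 = _mm_xor_si128(t2, gh);
	t2 = _mm_xor_si128(t2, t1);               /* middle part */

	t3 = _mm_slli_si128(t2, 8);
	t2 = _mm_srli_si128(t2, 8);
	gh = _mm_xor_si128(gh, t3);
	t1 = _mm_xor_si128(t1, t2);               /* <t1:gh> = 256-bit product */

	/* first phase of the reduction */
	t2 = _mm_slli_epi32(gh, 31);
	t3 = _mm_slli_epi32(gh, 30);
	t4 = _mm_slli_epi32(gh, 25);
	t2 = _mm_xor_si128(t2, t3);
	t2 = _mm_xor_si128(t2, t4);
	t5 = _mm_srli_si128(t2, 4);
	t2 = _mm_slli_si128(t2, 12);
	gh = _mm_xor_si128(gh, t2);

	/* second phase of the reduction */
	t2 = _mm_srli_epi32(gh, 1);
	t3 = _mm_srli_epi32(gh, 2);
	t4 = _mm_srli_epi32(gh, 7);
	t2 = _mm_xor_si128(t2, t3);
	t2 = _mm_xor_si128(t2, t4);
	t2 = _mm_xor_si128(t2, t5);
	gh = _mm_xor_si128(gh, t2);
	return _mm_xor_si128(gh, t1);             /* result */
}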
+
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
+ movdqa %%T4, %%HK
+ pshufd %%T1, %%HK, 01001110b
+ pxor %%T1, %%HK
+ movdqu [%%GDATA + HashKey_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^2<<1 mod poly
+ movdqu [%%GDATA + HashKey_2], %%T4 ; [HashKey_2] = HashKey^2<<1 mod poly
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^3<<1 mod poly
+ movdqu [%%GDATA + HashKey_3], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_3_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^4<<1 mod poly
+ movdqu [%%GDATA + HashKey_4], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^5<<1 mod poly
+ movdqu [%%GDATA + HashKey_5], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_5_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^6<<1 mod poly
+ movdqu [%%GDATA + HashKey_6], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^7<<1 mod poly
+ movdqu [%%GDATA + HashKey_7], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^8<<1 mod poly
+ movdqu [%%GDATA + HashKey_8], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_8_k], %%T1
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ pxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+        pinsrq  %%OUTPUT, [%%INPUT],0           ;Read in 8 bytes if they exist
+ je %%_done
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ pinsrq %%OUTPUT, %%TMP1, 1
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ pinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
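Functionally, the macro above packs up to 15 trailing bytes into a zero-padded 16-byte lane without reading past the end of the buffer. A hedged C equivalent of the result (not of the technique; the macro deliberately avoids a bounce buffer), with a name of our choosing:

#include <stdint.h>
#include <string.h>
#include <emmintrin.h>

/* Sketch only: same output as READ_SMALL_DATA_INPUT for 0 < len < 16. */
static __m128i read_small_input_sketch(const uint8_t *in, size_t len)
{
	uint8_t block[16] = { 0 };               /* zero padding */
	memcpy(block, in, len < 16 ? len : 16);  /* never reads past the end */
	return _mm_loadu_si128((const __m128i *)block);
}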
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 5
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ pxor %%AAD_HASH, %%AAD_HASH
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+
+ movdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ pshufb %%XTMP1, [SHUF_MASK]
+ pxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ pshufb %%XTMP1, [SHUF_MASK]
+ pxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
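A hedged C outline of the AAD hashing flow above, assuming SSSE3 for the byte reflection and reusing the ghash_mul_sketch and read_small_input_sketch helpers sketched earlier in this file (all names are ours):

#include <stddef.h>
#include <stdint.h>
#include <tmmintrin.h>          /* _mm_shuffle_epi8 */

static __m128i calc_aad_hash_sketch(const uint8_t *aad, size_t len, __m128i hk)
{
	/* byte-reversal mask, the C counterpart of [SHUF_MASK] */
	const __m128i bswap = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
	                                   8, 9, 10, 11, 12, 13, 14, 15);
	__m128i hash = _mm_setzero_si128();

	while (len >= 16) {                      /* %%_get_AAD_loop16 */
		__m128i blk = _mm_loadu_si128((const __m128i *)aad);
		blk = _mm_shuffle_epi8(blk, bswap);
		hash = ghash_mul_sketch(_mm_xor_si128(hash, blk), hk);
		aad += 16;
		len -= 16;
	}
	if (len) {                               /* %%_get_small_AAD_block */
		__m128i blk = read_small_input_sketch(aad, len);
		blk = _mm_shuffle_epi8(blk, bswap);
		hash = ghash_mul_sketch(_mm_xor_si128(hash, blk), hk);
	}
	return hash;
}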
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_key_data (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN), the current data offset (DATA_OFFSET),
+; and whether encoding or decoding (ENC_DEC).
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 8
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%DATA_OFFSET %6
+%define %%AAD_HASH %7
+%define %%ENC_DEC %8
+ mov r13, [%%GDATA_CTX + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ XLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+ mov r13, [%%GDATA_CTX + PBlockLen]
+
+%%_data_read: ;Finished reading in data
+
+
+ movdqu xmm9, [%%GDATA_CTX + PBlockEncKey] ;xmm9 = ctx_data.partial_block_enc_key
+ movdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ movdqu xmm2, [r12] ; get the appropriate shuffle mask
+ pshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ movdqa xmm3, xmm1
+ pxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_1              ;Determine if the partial block is not being completely filled and adjust the shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_1:
+
+ movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ pand xmm3, xmm1
+ pshufb xmm3, [SHUF_MASK]
+ pshufb xmm3, xmm2
+ pxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%%_dec_done:
+ movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%else
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_2              ;Determine if the partial block is not being completely filled and adjust the shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_2:
+
+ movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ pshufb xmm9, [SHUF_MASK]
+ pshufb xmm9, xmm2
+ pxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA_CTX + PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%%_encode_done:
+ movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+ pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ pshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ movq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ psrldq xmm9, 8
+ movq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
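The keystream bookkeeping that PARTIAL_BLOCK performs between update calls can be hard to follow in masked-shuffle form. A hedged, purely illustrative C sketch of the idea (the GHASH update and output masking are omitted; all names are ours):

#include <stddef.h>
#include <stdint.h>

/* Up to 15 bytes of the previous call's keystream block (PBlockEncKey) are
 * left over; new input is XORed against them before the bulk loop resumes
 * on a 16-byte boundary. Returns the number of input bytes consumed here. */
static size_t partial_block_sketch(uint8_t *out, const uint8_t *in, size_t len,
                                   const uint8_t keystream[16], size_t *pblock_len)
{
	size_t used = *pblock_len;               /* keystream bytes already consumed */
	size_t n = 0;

	while (used < 16 && n < len) {           /* finish the carried block */
		out[n] = in[n] ^ keystream[used];
		used++;
		n++;
	}
	*pblock_len = (used == 16) ? 0 : used;   /* block complete -> reset */
	return n;
}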
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified
+; Updated AAD_HASH is returned in %%T3
+
+%macro INITIAL_BLOCKS 24
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%LENGTH %5
+%define %%DATA_OFFSET %6
+%define %%num_initial_blocks %7 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %8
+%define %%HASH_KEY %9
+%define %%T3 %10
+%define %%T4 %11
+%define %%T5 %12
+%define %%CTR %13
+%define %%XMM1 %14
+%define %%XMM2 %15
+%define %%XMM3 %16
+%define %%XMM4 %17
+%define %%XMM5 %18
+%define %%XMM6 %19
+%define %%XMM7 %20
+%define %%XMM8 %21
+%define %%T6 %22
+%define %%T_key %23
+%define %%ENC_DEC %24
+
+%assign i (8-%%num_initial_blocks)
+ movdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg
+
+ ; start AES for %%num_initial_blocks blocks
+ movdqu %%CTR, [%%GDATA_CTX + CurCount] ; %%CTR = Y0
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa reg(i), %%CTR
+ pshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+movdqu %%T_key, [%%GDATA_KEY+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ pxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep NROUNDS                                                    ; encrypt N blocks with NROUNDS aesenc rounds (9/11/13 for GCM128/192/256)
+movdqu %%T_key, [%%GDATA_KEY+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ aesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+movdqu  %%T_key, [%%GDATA_KEY+16*j]                             ; encrypt with the last key round (10th/12th/14th for GCM128/192/256)
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ aesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ pxor reg(i), %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ movdqa reg(i), %%T1
+ %endif
+ pshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ pxor reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ movdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; no need for precomputed constants
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM1, %%CTR
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM2, %%CTR
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM3, %%CTR
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM4, %%CTR
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM5, %%CTR
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM6, %%CTR
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM7, %%CTR
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM8, %%CTR
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ movdqu %%T_key, [%%GDATA_KEY+16*0]
+ pxor %%XMM1, %%T_key
+ pxor %%XMM2, %%T_key
+ pxor %%XMM3, %%T_key
+ pxor %%XMM4, %%T_key
+ pxor %%XMM5, %%T_key
+ pxor %%XMM6, %%T_key
+ pxor %%XMM7, %%T_key
+ pxor %%XMM8, %%T_key
+
+
+%assign i 1
+%rep NROUNDS                                                    ; do the first NROUNDS aesenc rounds (9/11/13 for GCM128/192/256)
+ movdqu %%T_key, [%%GDATA_KEY+16*i]
+ aesenc %%XMM1, %%T_key
+ aesenc %%XMM2, %%T_key
+ aesenc %%XMM3, %%T_key
+ aesenc %%XMM4, %%T_key
+ aesenc %%XMM5, %%T_key
+ aesenc %%XMM6, %%T_key
+ aesenc %%XMM7, %%T_key
+ aesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ movdqu %%T_key, [%%GDATA_KEY+16*i] ; do final key round
+ aesenclast %%XMM1, %%T_key
+ aesenclast %%XMM2, %%T_key
+ aesenclast %%XMM3, %%T_key
+ aesenclast %%XMM4, %%T_key
+ aesenclast %%XMM5, %%T_key
+ aesenclast %%XMM6, %%T_key
+ aesenclast %%XMM7, %%T_key
+ aesenclast %%XMM8, %%T_key
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ pxor %%XMM1, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM1, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ pxor %%XMM2, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM2, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ pxor %%XMM3, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM3, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ pxor %%XMM4, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM4, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ pxor %%XMM5, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM5, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ pxor %%XMM6, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM6, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ pxor %%XMM7, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM7, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ pxor %%XMM8, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA (KEY), %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ movdqa %%T7, %%XMM1
+ movdqu [rsp + TMP2], %%XMM2
+ movdqu [rsp + TMP3], %%XMM3
+ movdqu [rsp + TMP4], %%XMM4
+ movdqu [rsp + TMP5], %%XMM5
+ movdqu [rsp + TMP6], %%XMM6
+ movdqu [rsp + TMP7], %%XMM7
+ movdqu [rsp + TMP8], %%XMM8
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+
+ movdqa %%T4, %%T7
+ pshufd %%T6, %%T7, 01001110b
+ pxor %%T6, %%T7
+ %ifidn %%loop_idx, in_order
+ paddd %%CTR, [ONE] ; INCR CNT
+ %else
+ paddd %%CTR, [ONEf] ; INCR CNT
+ %endif
+ movdqu %%T5, [%%GDATA + HashKey_8]
+ pclmulqdq %%T4, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T7, %%T5, 0x00 ; %%T7 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_8_k]
+ pclmulqdq %%T6, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ movdqa %%XMM1, %%CTR
+
+ %ifidn %%loop_idx, in_order
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM2, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM3, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM4, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM5, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM6, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM7, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM8, %%CTR
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+ %else
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM2, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM3, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM4, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM5, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM6, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM7, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM8, %%CTR
+ %endif
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ movdqu %%T1, [%%GDATA + 16*0]
+ pxor %%XMM1, %%T1
+ pxor %%XMM2, %%T1
+ pxor %%XMM3, %%T1
+ pxor %%XMM4, %%T1
+ pxor %%XMM5, %%T1
+ pxor %%XMM6, %%T1
+ pxor %%XMM7, %%T1
+ pxor %%XMM8, %%T1
+
+ ;; %%XMM6, %%T5 hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+ movdqu %%T1, [rsp + TMP2]
+ movdqa %%T3, %%T1
+
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_7]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_7_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*1]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ movdqu %%T1, [%%GDATA + 16*2]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Karatsuba Method
+ movdqu %%T1, [rsp + TMP3]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_6]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_6_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*3]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP4]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_5]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_5_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*4]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*5]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP5]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_4]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_4_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+
+ movdqu %%T1, [%%GDATA + 16*6]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+ movdqu %%T1, [rsp + TMP6]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_3]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_3_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*7]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP7]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_2]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_2_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*8]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ ;; %%XMM8, %%T5 hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+ movdqu %%T1, [rsp + TMP8]
+ movdqa %%T3, %%T1
+
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T7, %%T3
+ pxor %%T4, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*9]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+%ifdef GCM128_MODE
+ movdqu %%T5, [%%GDATA + 16*10]
+%endif
+%ifdef GCM192_MODE
+ movdqu %%T1, [%%GDATA + 16*10]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*11]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T5, [%%GDATA + 16*12] ; finish last key round
+%endif
+%ifdef GCM256_MODE
+ movdqu %%T1, [%%GDATA + 16*10]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*11]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*12]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*13]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T5, [%%GDATA + 16*14] ; finish last key round
+%endif
+
+%assign i 0
+%assign j 1
+%rep 8
+ XLDR %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+
+%ifidn %%ENC_DEC, DEC
+ movdqa %%T3, %%T1
+%endif
+
+ pxor %%T1, %%T5
+ aesenclast reg(j), %%T1 ; XMM1:XMM8
+ XSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j) ; Write to the Output buffer
+
+%ifidn %%ENC_DEC, DEC
+ movdqa reg(j), %%T3
+%endif
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+
+
+
+ pxor %%T2, %%T6
+ pxor %%T2, %%T4
+ pxor %%T2, %%T7
+
+
+ movdqa %%T3, %%T2
+ pslldq %%T3, 8 ; shift-L %%T3 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%T7, %%T3
+ pxor %%T4, %%T2 ; accumulate the results in %%T4:%%T7
+
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%T7
+ movdqa %%T3, %%T7
+ movdqa %%T1, %%T7 ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three shifts independently
+
+        pslld   %%T2, 31                                        ; packed left shift << 31
+        pslld   %%T3, 30                                        ; packed left shift << 30
+        pslld   %%T1, 25                                        ; packed left shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T1
+
+ movdqa %%T5, %%T2
+ psrldq %%T5, 4 ; shift-R %%T5 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ ;second phase of the reduction
+        movdqa  %%T2,%%T7                                       ; make 3 copies of %%T7 (in %%T2, %%T3, %%T1) for doing three shift operations
+ movdqa %%T3,%%T7
+ movdqa %%T1,%%T7
+
+        psrld   %%T2,1                                          ; packed right shift >> 1
+        psrld   %%T3,2                                          ; packed right shift >> 2
+        psrld   %%T1,7                                          ; packed right shift >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T1
+
+ pxor %%T2, %%T5
+ pxor %%T7, %%T2
+        pxor    %%T7, %%T4                                      ; the result is in %%T7
+
+
+ pxor %%XMM1, %%T7
+
+%endmacro
+
+
+; GHASH the last 8 ciphertext blocks.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+
+ ; Karatsuba Method
+ movdqa %%T6, %%XMM1
+ pshufd %%T2, %%XMM1, 01001110b
+ pxor %%T2, %%XMM1
+ movdqu %%T5, [%%GDATA + HashKey_8]
+ pclmulqdq %%T6, %%T5, 0x11 ; %%T6 = a1*b1
+
+ pclmulqdq %%XMM1, %%T5, 0x00 ; %%XMM1 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_8_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ movdqa %%T7, %%XMM1
+ movdqa %%XMM1, %%T2 ; result in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM2
+ pshufd %%T2, %%XMM2, 01001110b
+ pxor %%T2, %%XMM2
+ movdqu %%T5, [%%GDATA + HashKey_7]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM2, %%T5, 0x00 ; %%XMM2 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_7_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM2
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM3
+ pshufd %%T2, %%XMM3, 01001110b
+ pxor %%T2, %%XMM3
+ movdqu %%T5, [%%GDATA + HashKey_6]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM3, %%T5, 0x00 ; %%XMM3 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_6_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM3
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM4
+ pshufd %%T2, %%XMM4, 01001110b
+ pxor %%T2, %%XMM4
+ movdqu %%T5, [%%GDATA + HashKey_5]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM4, %%T5, 0x00              ; %%XMM4 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_5_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM4
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM5
+ pshufd %%T2, %%XMM5, 01001110b
+ pxor %%T2, %%XMM5
+ movdqu %%T5, [%%GDATA + HashKey_4]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM5, %%T5, 0x00              ; %%XMM5 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_4_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM5
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM6
+ pshufd %%T2, %%XMM6, 01001110b
+ pxor %%T2, %%XMM6
+ movdqu %%T5, [%%GDATA + HashKey_3]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM6, %%T5, 0x00              ; %%XMM6 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_3_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM6
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM7
+ pshufd %%T2, %%XMM7, 01001110b
+ pxor %%T2, %%XMM7
+ movdqu %%T5, [%%GDATA + HashKey_2]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM7, %%T5, 0x00              ; %%XMM7 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_2_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM7
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM8
+ pshufd %%T2, %%XMM8, 01001110b
+ pxor %%T2, %%XMM8
+ movdqu %%T5, [%%GDATA + HashKey]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM8, %%T5, 0x00              ; %%XMM8 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM8
+ pxor %%T2, %%XMM1
+ pxor %%T2, %%T6
+ pxor %%T2, %%T7 ; middle section of the temp results combined as in Karatsuba algorithm
+
+
+ movdqa %%T4, %%T2
+ pslldq %%T4, 8 ; shift-L %%T4 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%T7, %%T4
+ pxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%T7
+ movdqa %%T3, %%T7
+ movdqa %%T4, %%T7 ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+        pslld   %%T2, 31                                        ; packed left shift << 31
+        pslld   %%T3, 30                                        ; packed left shift << 30
+        pslld   %%T4, 25                                        ; packed left shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T4
+
+ movdqa %%T1, %%T2
+ psrldq %%T1, 4 ; shift-R %%T1 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+        movdqa  %%T2,%%T7                                       ; make 3 copies of %%T7 (in %%T2, %%T3, %%T4) for doing three shift operations
+ movdqa %%T3,%%T7
+ movdqa %%T4,%%T7
+
+        psrld   %%T2,1                                          ; packed right shift >> 1
+        psrld   %%T3,2                                          ; packed right shift >> 2
+        psrld   %%T4,7                                          ; packed right shift >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T4
+
+ pxor %%T2, %%T1
+ pxor %%T7, %%T2
+ pxor %%T6, %%T7 ; the result is in %%T6
+
+%endmacro
+
+; Encryption of a single block
+%macro ENCRYPT_SINGLE_BLOCK 3
+%define %%GDATA %1
+%define %%ST %2
+%define %%T1 %3
+ movdqu %%T1, [%%GDATA+16*0]
+ pxor %%ST, %%T1
+%assign i 1
+%rep NROUNDS
+ movdqu %%T1, [%%GDATA+16*i]
+ aesenc %%ST, %%T1
+%assign i (i+1)
+%endrep
+ movdqu %%T1, [%%GDATA+16*i]
+ aesenclast %%ST, %%T1
+%endmacro
+
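A hedged AES-NI mirror of ENCRYPT_SINGLE_BLOCK (the helper name and signature are ours; it assumes the AES-NI intrinsics and an expanded key schedule laid out as consecutive 16-byte round keys, as the asm expects):

#include <emmintrin.h>
#include <wmmintrin.h>          /* AES-NI intrinsics */

/* nrounds+1 round keys: nrounds is 10/12/14 for AES-128/192/256
 * (the asm's NROUNDS of 9/11/13 excludes the aesenclast round). */
static __m128i encrypt_single_block_sketch(const __m128i *round_keys,
                                           int nrounds, __m128i st)
{
	int i;

	st = _mm_xor_si128(st, round_keys[0]);          /* initial whitening */
	for (i = 1; i < nrounds; i++)
		st = _mm_aesenc_si128(st, round_keys[i]);
	return _mm_aesenclast_si128(st, round_keys[nrounds]);
}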
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+        ;; Required for Update/GCM_ENC
+ ;the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ push rsi
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+        and     rsp, ~63                        ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ movdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ movdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ movdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ movdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ movdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ movdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ movdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ movdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ movdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+
+ mov arg5, arg(5) ;[r14 + STACK_OFFSET + 8*5]
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ movdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ movdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ movdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ movdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ movdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ movdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ movdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ movdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ movdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14
+ pop rsi
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
+; Additional Authentication data (A_IN), Additional Data length (A_LEN).
+; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA.
+; Clobbers rax, r10-r13 and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%IV %3
+%define %%A_IN %4
+%define %%A_LEN %5
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ movdqu %%SUBHASH, [%%GDATA_KEY + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ pxor xmm2, xmm3
+ mov r10, %%A_LEN
+
+ movdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
+ mov [%%GDATA_CTX + AadLen], r10 ; ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA_CTX + InLen], r10 ; ctx_data.in_length = 0
+ mov [%%GDATA_CTX + PBlockLen], r10 ; ctx_data.partial_block_length = 0
+ movdqu [%%GDATA_CTX + PBlockEncKey], xmm2 ; ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ movdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001
+ pinsrq xmm2, [r10], 0
+ pinsrd xmm2, [r10+8], 2
+ movdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv
+
+ pshufb xmm2, [SHUF_MASK]
+
+ movdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data
+; struct has been initialized by GCM_INIT.
+; Requires the input data to be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data (GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%CYPH_PLAIN_OUT %3
+%define %%PLAIN_CYPH_IN %4
+%define %%PLAIN_CYPH_LEN %5
+%define %%ENC_DEC %6
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+;       if there is a block of less than 16 bytes, process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+;       (a short C sketch of this block split follows the macro)
+
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+ movdqu xmm13, [%%GDATA_KEY + HashKey] ; xmm13 = HashKey
+ movdqu xmm8, [%%GDATA_CTX + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+ mov r13, %%PLAIN_CYPH_LEN ; save the number of bytes of plaintext/ciphertext
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ;save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ movd r15d, xmm9
+ and r15d, 255
+ pshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ pshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ pshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ pshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ pshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ GHASH_LAST_8 %%GDATA_KEY, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ movdqu [%%GDATA_CTX + AadHash], xmm14
+ movdqu [%%GDATA_CTX + CurCount], xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+        mov     [%%GDATA_CTX + PBlockLen], r13          ; my_ctx.data.partial_block_length = r13
+        ; handle the last <16 Byte block separately
+
+ paddd xmm9, [ONE] ; INCR CNT to get Yn
+ movdqu [%%GDATA_CTX + CurCount], xmm9 ; my_ctx.data.current_counter = xmm9
+ pshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Yn)
+ movdqu [%%GDATA_CTX + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ movdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+ movdqu xmm2, [r12] ; get the appropriate shuffle mask
+ pshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+ %ifidn %%ENC_DEC, DEC
+ movdqa xmm2, xmm1
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ pand xmm2, xmm1
+ pshufb xmm2, [SHUF_MASK]
+ pxor xmm14, xmm2
+ movdqu [%%GDATA_CTX + AadHash], xmm14
+
+ %else
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ pshufb xmm9, [SHUF_MASK]
+ pxor xmm14, xmm9
+ movdqu [%%GDATA_CTX + AadHash], xmm14
+
+ pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ %endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ movq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ psrldq xmm9, 8
+ movq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and
+; whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 5
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%PLAIN_CYPH_LEN rax
+
+	mov	r12, [%%GDATA_CTX + PBlockLen]		; r12 = PBlockLen (number of bytes in the partial block)
+ movdqu xmm14, [%%GDATA_CTX + AadHash]
+ movdqu xmm13, [%%GDATA_KEY + HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ movdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ movd xmm15, r12d ; len(A) in xmm15
+
+	shl	%%PLAIN_CYPH_LEN, 3			; len(C) in bits (*8)
+ movq xmm1, %%PLAIN_CYPH_LEN
+ pslldq xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ pxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
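+        ; Fold len(A)||len(C) into the hash; the tag is then T = GHASH result XOR E(K, Y0),
+        ; as defined by the GCM specification.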
+ pxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ pshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ movdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9, xmm2 ; E(K, Y0)
+
+ pxor xmm9, xmm14
+
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8:
+ movq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ movq rax, xmm9
+ mov [r10], rax
+ psrldq xmm9, 8
+ movd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ movdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ;GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_sse / aes_gcm_precomp_192_sse / aes_gcm_precomp_256_sse
+; (struct gcm_key_data *key_data);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
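+; Derives the GHASH key H = E(K, 0^128), reduces it to HashKey<<1 mod poly and
+; precomputes the table of hash-key powers used by the GHASH code.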
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(precomp,_)
+FN_NAME(precomp,_):
+ endbranch
+
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ pxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6, xmm2 ; xmm6 = HashKey
+
+ pshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ movdqa xmm2, xmm6
+ psllq xmm6, 1
+ psrlq xmm2, 63
+ movdqa xmm1, xmm2
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ por xmm6, xmm2
+ ;reduction
+ pshufd xmm2, xmm1, 00100100b
+ pcmpeqd xmm2, [TWOONE]
+ pand xmm2, [POLY]
+ pxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+	ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_sse / aes_gcm_init_192_sse / aes_gcm_init_256_sse (
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(init,_)
+FN_NAME(init,_):
+ endbranch
+
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ push arg5
+ sub rsp, 1*16
+ movdqu [rsp + 0*16],xmm6
+ mov arg5, [rsp + 1*16 + 8*3 + 8*5]
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4, arg5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+ pop arg5
+%endif
+ pop r13
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_sse / aes_gcm_enc_192_update_sse / aes_gcm_enc_256_update_sse (
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_update_)
+FN_NAME(enc,_update_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_sse / aes_gcm_dec_192_update_sse / aes_gcm_dec_256_update_sse (
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_update_)
+FN_NAME(dec,_update_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_sse / aes_gcm_enc_192_finalize_sse / aes_gcm_enc_256_finalize_sse (
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(enc,_finalize_)
+FN_NAME(enc,_finalize_):
+ endbranch
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ movdqu [rsp + 0*16],xmm6
+ movdqu [rsp + 1*16],xmm9
+ movdqu [rsp + 2*16],xmm11
+ movdqu [rsp + 3*16],xmm14
+ movdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + 4*16]
+ movdqu xmm14 , [rsp+ 3*16]
+ movdqu xmm11 , [rsp + 2*16]
+ movdqu xmm9 , [rsp + 1*16]
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_sse / aes_gcm_dec_192_finalize_sse / aes_gcm_dec_256_finalize_sse (
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(dec,_finalize_)
+FN_NAME(dec,_finalize_):
+ endbranch
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ movdqu [rsp + 0*16],xmm6
+ movdqu [rsp + 1*16],xmm9
+ movdqu [rsp + 2*16],xmm11
+ movdqu [rsp + 3*16],xmm14
+ movdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, arg4, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + 4*16]
+ movdqu xmm14 , [rsp+ 3*16]
+ movdqu xmm11 , [rsp + 2*16]
+ movdqu xmm9 , [rsp + 1*16]
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_sse / aes_gcm_enc_192_sse / aes_gcm_enc_256_sse (
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_)
+FN_NAME(enc,_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_sse / aes_gcm_dec_192_sse / aes_gcm_dec_256_sse (
+; const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_)
+FN_NAME(dec,_):
+ endbranch
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg2, arg6, arg7, arg8
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC
+
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c
new file mode 100644
index 000000000..b0a6221d5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c
@@ -0,0 +1,1940 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include <openssl/sha.h>
+#include "gcm_vectors.h"
+#include "ossl_helper.h"
+#include "types.h"
+
+//#define GCM_VECTORS_VERBOSE
+//#define GCM_VECTORS_EXTRA_VERBOSE
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#ifndef RANDOMS
+# define RANDOMS 200
+#endif
+#ifndef TEST_LEN
+# define TEST_LEN 32*1024
+#endif
+#ifndef PAGE_LEN
+# define PAGE_LEN (4*1024)
+#endif
+
+#if defined(NT_LD) || defined(NT_ST) || defined(NT_LDST)
+# define ALIGNMENT_MASK (~15)
+# define OFFSET_BASE_VALUE 16
+#ifndef MAX_UNALIGNED
+# define MAX_UNALIGNED (1)
+#endif
+#else
+# define ALIGNMENT_MASK (~0)
+# define OFFSET_BASE_VALUE 1
+#ifndef MAX_UNALIGNED
+# define MAX_UNALIGNED (16)
+#endif
+#endif
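+
+// The non-temporal (NT_*) load/store variants require 16-byte-aligned buffers, so the
+// random tests keep offsets and chunk sizes 16-byte aligned via ALIGNMENT_MASK,
+// OFFSET_BASE_VALUE and MAX_UNALIGNED above.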
+
+void dump_table(char *title, uint8_t * table, uint8_t count)
+{
+ int i;
+ char const *space = " ";
+
+ printf("%s%s => {\n", space, title);
+ for (i = 0; i < count; i++) {
+ if (0 == (i & 15))
+ printf("%s%s", space, space);
+ printf("%2x, ", table[i]);
+ if (15 == (i & 15))
+ printf("\n");
+
+ }
+ printf("%s}\n", space);
+}
+
+void dump_gcm_data(struct gcm_key_data *gkey)
+{
+#ifdef GCM_VECTORS_EXTRA_VERBOSE
+ printf("gcm_data {\n");
+ dump_table("expanded_keys", gkey->expanded_keys, (16 * 11));
+ dump_table("shifted_hkey_1", gkey->shifted_hkey_1, 16);
+ dump_table("shifted_hkey_2", gkey->shifted_hkey_2, 16);
+ dump_table("shifted_hkey_3", gkey->shifted_hkey_3, 16);
+ dump_table("shifted_hkey_4", gkey->shifted_hkey_4, 16);
+ dump_table("shifted_hkey_5", gkey->shifted_hkey_5, 16);
+ dump_table("shifted_hkey_6", gkey->shifted_hkey_6, 16);
+ dump_table("shifted_hkey_7", gkey->shifted_hkey_7, 16);
+ dump_table("shifted_hkey_8", gkey->shifted_hkey_8, 16);
+ dump_table("shifted_hkey_1_k", gkey->shifted_hkey_1_k, 16);
+ dump_table("shifted_hkey_2_k", gkey->shifted_hkey_2_k, 16);
+ dump_table("shifted_hkey_3_k", gkey->shifted_hkey_3_k, 16);
+ dump_table("shifted_hkey_4_k", gkey->shifted_hkey_4_k, 16);
+ dump_table("shifted_hkey_5_k", gkey->shifted_hkey_5_k, 16);
+ dump_table("shifted_hkey_6_k", gkey->shifted_hkey_6_k, 16);
+ dump_table("shifted_hkey_7_k", gkey->shifted_hkey_7_k, 16);
+ dump_table("shifted_hkey_8_k", gkey->shifted_hkey_8_k, 16);
+ printf("}\n");
+#endif //GCM_VECTORS_EXTRA_VERBOSE
+}
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+
+ mismatch = memcmp(test, expected, len);
+ if (mismatch) {
+ OK = 1;
+ printf(" expected results don't match %s \t\t", data_name);
+ {
+ uint64_t a;
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ }
+ return OK;
+}
+
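+// Runs a single random vector through the one-shot ISA-L AES-GCM-128 encrypt/decrypt
+// APIs and cross-checks ciphertext, plaintext and tag against OpenSSL.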
+int check_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx, gcm_vector * vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+	// Allocate space for the calculated plaintext and ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_128(gkey, gctx, vector->C, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_128(gkey, gctx, vector->P, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ free(pt_test);
+ free(ct_test);
+ free(o_ct_test);
+
+ return OK;
+}
+
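+// Streaming variant of check_vector: feeds the message to the *_update APIs in
+// randomly sized chunks (copied into freshly malloc'd buffers) and interleaves
+// unrelated SHA1 work between chunks, then cross-checks against OpenSSL.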
+int check_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector, int test_len)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break;
+ int i;
+ uint8_t *rand_data = NULL;
+ uint64_t length;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+	// Allocate space for the calculated plaintext and ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+
+ last_break = 0;
+ i = (rand() % test_len / 32) & ALIGNMENT_MASK;
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->P + last_break, i - last_break);
+ }
+ aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+ last_break = i;
+ i = (rand() % test_len / 32) & ALIGNMENT_MASK;
+
+ }
+ aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ if (gctx->in_length != vector->Plen)
+ printf("%lu, %lu\n", gctx->in_length, vector->Plen);
+ aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = 0;
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (rand() % (test_len / 64) == 0) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->C + last_break, i - last_break);
+ }
+ aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+
+ last_break = i;
+
+ }
+ if (rand() % 1024 != 0)
+ i++;
+
+ }
+ aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ free(pt_test);
+ free(ct_test);
+ free(o_ct_test);
+ free(rand_data);
+
+ return OK;
+}
+
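+// Streaming check with a deterministic chunking pattern: the first chunk ends at
+// `length` and each update then advances by (length - start) / breaks bytes.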
+int check_strm_vector2(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector, int length, int start, int breaks)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break = 0;
+ int i = length;
+ uint8_t *rand_data = NULL;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+	// Allocate space for the calculated plaintext and ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_128(gkey, gctx, vector->C, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->P + last_break, i - last_break);
+ }
+ aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+ last_break = i;
+ i = i + (length - start) / breaks;
+
+ }
+ aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = length;
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->C + last_break, i - last_break);
+ }
+ aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+ last_break = i;
+ i = i + (length - start) / breaks;
+
+ }
+
+ aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+	free(T_test);
+	free(o_T_test);
+	free(IV_c);
+	free(pt_test);
+	free(ct_test);
+	free(o_ct_test);
+	free(rand_data);
+
+ return OK;
+}
+
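+// Streaming check where each chunk is copied to the end of a freshly allocated
+// PAGE_LEN buffer, so a read past the supplied length is likely to be caught
+// (efence-style).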
+int check_strm_vector_efence(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break = 0;
+ int i = 1;
+ uint8_t *rand_data = NULL;
+ uint64_t length;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+	// Allocate space for the calculated plaintext and ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < vector->Plen) {
+ if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) {
+ stream = malloc(PAGE_LEN);
+ i = i & ALIGNMENT_MASK;
+ memcpy(stream + PAGE_LEN - (i - last_break), vector->P + last_break,
+ i - last_break);
+ aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break,
+ stream + PAGE_LEN - (i - last_break),
+ i - last_break);
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+ last_break = i;
+ }
+ if (rand() % 1024 != 0)
+ i++;
+
+ }
+ aes_gcm_enc_128_update(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ aes_gcm_enc_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = 0;
+ aes_gcm_init_128(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < vector->Plen) {
+ if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) {
+ stream = malloc(PAGE_LEN);
+ i = i & ALIGNMENT_MASK;
+ memcpy(stream + PAGE_LEN - (i - last_break), vector->C + last_break,
+ i - last_break);
+ aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break,
+ stream + PAGE_LEN - (i - last_break),
+ i - last_break);
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+
+ last_break = i;
+
+ }
+ if (rand() % 1024 != 0)
+ i++;
+
+ }
+ aes_gcm_dec_128_update(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_128_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_128(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ free(pt_test);
+ free(ct_test);
+ free(o_ct_test);
+ free(rand_data);
+
+ return OK;
+}
+
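+// AES-GCM-256 counterpart of check_vector.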
+int check_256_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+	// Allocate space for the calculated plaintext and ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_256(gkey, gctx, vector->C, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_256(gkey, gctx, vector->P, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted ISA-L plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_256(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted OpenSSL plain text (P)");
+ result =
+ openssl_aes_256_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ free(pt_test);
+ free(ct_test);
+ free(o_ct_test);
+
+ return OK;
+}
+
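+// AES-GCM-256 counterpart of check_strm_vector.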
+int check_256_strm_vector(struct gcm_key_data *gkey, struct gcm_context_data *gctx,
+ gcm_vector * vector, int test_len)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break;
+ int i;
+ uint8_t *rand_data = NULL;
+ uint64_t length;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+	// Allocate space for the calculated plaintext and ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen);
+
+ last_break = 0;
+ i = (rand() % test_len / 32) & ALIGNMENT_MASK;
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->P + last_break, i - last_break);
+ }
+
+ aes_gcm_enc_256_update(gkey, gctx, vector->C + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+ last_break = i;
+ i += (rand() % test_len / 32) & ALIGNMENT_MASK;
+
+ }
+ aes_gcm_enc_256_update(gkey, gctx, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ if (gctx->in_length != vector->Plen)
+ printf("%lu, %lu\n", gctx->in_length, vector->Plen);
+ aes_gcm_enc_256_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i += (rand() % test_len / 32) & ALIGNMENT_MASK;
+ aes_gcm_init_256(gkey, gctx, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->C + last_break, i - last_break);
+ }
+
+ aes_gcm_dec_256_update(gkey, gctx, vector->P + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+
+ last_break = i;
+ i += (rand() % test_len / 32) & ALIGNMENT_MASK;
+
+ }
+ aes_gcm_dec_256_update(gkey, gctx, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aes_gcm_dec_256_finalize(gkey, gctx, vector->T, vector->Tlen);
+
+ OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted ISA-L plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aes_gcm_dec_256(gkey, gctx, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted OpenSSL plain text (P)");
+ result =
+ openssl_aes_256_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ free(pt_test);
+ free(ct_test);
+ free(o_ct_test);
+
+ return OK;
+}
+
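+// Random efence-style streaming tests for AES-GCM-128 with tag lengths 8, 12 and 16.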
+int test_gcm_strm_efence(void)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES GCM random efence test vectors with random stream:");
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % TEST_LEN);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_strm_vector_efence(gkey, gctx, &test))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
+
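+// Random streaming tests for AES-GCM-128 over a key structure placed at a random
+// (possibly unaligned) offset.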
+int test_gcm_strm_combinations(int test_len)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ uint8_t *gkeytemp = NULL;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+
+ gkeytemp = malloc(sizeof(struct gcm_key_data) + 16);
+ gctx = malloc(sizeof(struct gcm_context_data));
+ gkey = (struct gcm_key_data *)(gkeytemp + rand() % 16);
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES GCM random test vectors with random stream of average size %d:",
+ test_len / 64);
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = 0; // (rand() % test_len);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % test_len);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_strm_vector(gkey, gctx, &test, test_len))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkeytemp);
+ free(gctx);
+ return 0;
+}
+
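+// Random one-shot tests for AES-GCM-128 over random plaintext, AAD, IV and key data.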
+int test_gcm_combinations(void)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES GCM random test vectors:");
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % TEST_LEN);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_vector(gkey, gctx, &test))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
+
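+// Random one-shot tests for AES-GCM-256.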
+int test_gcm256_combinations(void)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES-GCM-256 random test vectors:");
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % TEST_LEN);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_256_KEY_LEN + offset);
+ test.Klen = GCM_256_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+		// single Key length of 256bits/32bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_256_vector(gkey, gctx, &test))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
+
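+// Random streaming tests for AES-GCM-256 over a key structure placed at a random
+// (possibly unaligned) offset.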
+int test_gcm256_strm_combinations(int test_len)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ uint8_t *gkeytemp = NULL;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+
+ gkeytemp = malloc(sizeof(struct gcm_key_data) + 16);
+ gctx = malloc(sizeof(struct gcm_context_data));
+ gkey = (struct gcm_key_data *)(gkeytemp + rand() % 16);
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES-GCM-256 random test vectors with random stream of average size %d:",
+ test_len / 64);
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % test_len);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % test_len);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_256_KEY_LEN + offset);
+ test.Klen = GCM_256_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+		// single Key length of 256bits/32bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_256_strm_vector(gkey, gctx, &test, test_len))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkeytemp);
+ free(gctx);
+ return 0;
+}
+
+//
+// place all data to end at a page boundary to check for read past the end
+//
+int test_gcm_efence(void)
+{
+ gcm_vector test;
+ int offset = 0;
+ gcm_key_size key_len;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+ uint8_t *P, *C, *K, *IV, *A, *T;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ P = malloc(PAGE_LEN);
+ C = malloc(PAGE_LEN);
+ K = malloc(PAGE_LEN);
+ IV = malloc(PAGE_LEN);
+ A = malloc(PAGE_LEN);
+ T = malloc(PAGE_LEN);
+ if ((NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV) || (NULL == A)
+ || (NULL == T) || (NULL == gkey) || (NULL == gctx)) {
+ printf("malloc of testsize:0x%x failed\n", PAGE_LEN);
+ return -1;
+ }
+
+ test.Plen = PAGE_LEN / 2;
+ // place buffers to end at page boundary
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.Alen = test.Plen;
+ test.Tlen = MAX_TAG_LEN;
+
+ printf("AES GCM efence test vectors:");
+ for (key_len = GCM_128_KEY_LEN; GCM_256_KEY_LEN >= key_len;
+ key_len += (GCM_256_KEY_LEN - GCM_128_KEY_LEN)) {
+ test.Klen = key_len;
+ for (offset = 0; MAX_UNALIGNED > offset; offset++) {
+ if (0 == (offset % 80))
+ printf("\n");
+ // move the start and size of the data block towards the end of the page
+ test.Plen = (PAGE_LEN / 2) - offset;
+ test.Alen = (PAGE_LEN / 4) - (offset * 4); //lengths must be a multiple of 4 bytes
+ //Place data at end of page
+ test.P = P + PAGE_LEN - test.Plen;
+ test.C = C + PAGE_LEN - test.Plen;
+ test.K = K + PAGE_LEN - test.Klen;
+ test.IV = IV + PAGE_LEN - test.IVlen;
+ test.A = A + PAGE_LEN - test.Alen;
+ test.T = T + PAGE_LEN - test.Tlen;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+ if (GCM_128_KEY_LEN == key_len) {
+ if (0 != check_vector(gkey, gctx, &test))
+ return 1;
+ } else {
+ if (0 != check_256_vector(gkey, gctx, &test))
+ return 1;
+ }
+ }
+ }
+ free(gkey);
+ free(gctx);
+ free(P);
+ free(C);
+ free(K);
+ free(IV);
+ free(A);
+ free(T);
+
+ printf("\n");
+ return 0;
+}
+
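+// Checks one published AES-GCM-128 test vector: one-shot and in-place encrypt/decrypt,
+// ISA-L<->ISA-L and ISA-L<->OpenSSL round trips.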
+int test_gcm128_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("AES-GCM-128:\n");
+#endif
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+ if (ct_test == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+	// Allocate space for the calculated plaintext
+ pt_test = malloc(vector->Plen);
+ if (pt_test == NULL) {
+ fprintf(stderr, "Can't allocate plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+	// Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, &gkey);
+#ifdef GCM_VECTORS_VERBOSE
+ dump_gcm_data(&gkey);
+#endif
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, pt_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L tag (T)");
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_128(&gkey, &gctx, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+	// test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_128(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+ // OpenSSl enc -> ISA-L dec
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |=
+ check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)");
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "OpenSSL->ISA-L decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)");
+ // ISA-L enc -> OpenSSl dec
+ aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ ct_test, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (P)");
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm256_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+ int result;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("AES-GCM-256:\n");
+#endif
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+ // Allocate space for the calculated plaintext
+ pt_test = malloc(vector->Plen);
+ if ((ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, &gkey);
+#ifdef GCM_VECTORS_VERBOSE
+ dump_gcm_data(&gkey);
+#endif
+
+ ////
+ // ISA-l Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, pt_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |= check_data(ct_test, vector->C, vector->Tlen, "OpenSSL vs KA - cypher text (C)");
+ OK |= check_data(pt_test, vector->T, vector->Tlen, "OpenSSL vs KA - tag (T)");
+ OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L - tag (T)");
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_256(&gkey, &gctx, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_256(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+ // OpenSSl enc -> ISA-L dec
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |=
+ check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)");
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "OpenSSL->ISA-L decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)");
+ // ISA-L enc -> OpenSSl dec
+ aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ result =
+ openssl_aes_256_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ ct_test, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (P)");
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm_std_vectors(void)
+{
+ int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]);
+ int vect;
+ int OK = 0;
+
+ printf("AES-GCM standard test vectors:\n");
+ for (vect = 0; vect < vectors_cnt; vect++) {
+#ifdef GCM_VECTORS_VERBOSE
+ printf
+ ("Standard vector %d/%d Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen,
+ (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen,
+ (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen);
+#else
+ printf(".");
+#endif
+
+ if (BITS_128 == gcm_vectors[vect].Klen) {
+ OK |= test_gcm128_std_vectors(&gcm_vectors[vect]);
+ } else {
+ OK |= test_gcm256_std_vectors(&gcm_vectors[vect]);
+ }
+ if (0 != OK)
+ return OK;
+ }
+ printf("\n");
+ return OK;
+}
+
+// The length of the data is set to length. The first stream is from 0 to start. After
+// that the data is broken into breaks chunks of equal size (except possibly the last
+// one due to divisibility).
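+// For example (per the description above), the call test_gcm_strm_combinations2(1024, 0, 1024)
+// made from main() below corresponds to a zero-length first stream followed by 1024 equal-sized
+// one-byte chunks covering the whole 1024-byte buffer.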
+int test_gcm_strm_combinations2(int length, int start, int breaks)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_key_data *gkey = NULL;
+ struct gcm_context_data *gctx = NULL;
+
+ gkey = malloc(sizeof(struct gcm_key_data));
+ gctx = malloc(sizeof(struct gcm_context_data));
+ if (NULL == gkey || NULL == gctx)
+ return 1;
+
+ printf("AES GCM random test vectors of length %d and stream with %d breaks:", length,
+ breaks + 1);
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = length;
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+ if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+ } else { //This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_strm_vector2(gkey, gctx, &test, length, start, breaks))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gkey);
+ free(gctx);
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ int errors = 0;
+ int seed;
+
+ if (argc == 1)
+ seed = TEST_SEED;
+ else
+ seed = atoi(argv[1]);
+
+ srand(seed);
+ printf("SEED: %d\n", seed);
+
+ errors += test_gcm_std_vectors();
+ errors += test_gcm256_combinations();
+ errors += test_gcm_combinations();
+ errors += test_gcm_efence();
+ errors += test_gcm256_strm_combinations(TEST_LEN);
+ errors += test_gcm_strm_combinations(TEST_LEN);
+ errors += test_gcm256_strm_combinations(1024);
+ errors += test_gcm_strm_combinations(1024);
+ errors += test_gcm_strm_efence();
+ errors += test_gcm_strm_combinations2(1024, 0, 1024);
+
+ if (0 == errors)
+ printf("...Pass\n");
+ else
+ printf("...Fail\n");
+
+ return errors;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c
new file mode 100644
index 000000000..54581d6b6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c
@@ -0,0 +1,659 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include "gcm_vectors.h"
+#include "types.h"
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+
+ mismatch = memcmp(test, expected, len);
+ if (mismatch) {
+ OK = 1;
+ printf(" expected results don't match %s \t\t", data_name);
+ {
+ uint64_t a;
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ }
+ return OK;
+}
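+
+// check_data() returns 0 on a match and 1 on a mismatch, so the tests below accumulate
+// failures with OK |= check_data(...) and treat any non-zero OK as a failed vector.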
+
+int test_gcm128_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+ // Allocate space for the plain text
+ pt_test = malloc(vector->Plen);
+ if ((ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_128(vector->K, &gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_128(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L encrypted tag T(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_128(&gkey, &gctx, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_128(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aes_gcm_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+ memset(pt_test, 0, vector->Plen);
+
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm256_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+ // Allocate space for the plain text
+ pt_test = malloc(vector->Plen);
+ if ((ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, &gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_enc_256(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L encrypted tag T(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_dec_256(&gkey, &gctx, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_dec_256(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aes_gcm_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+void aes_gcm_stream_enc_128(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context,
+ uint8_t * out,
+ uint8_t const *in,
+ uint64_t len,
+ uint8_t * iv,
+ uint8_t const *aad,
+ uint64_t aad_len, uint8_t * auth_tag, uint64_t auth_tag_len)
+{
+ aes_gcm_init_128(key_data, context, iv, aad, aad_len);
+ uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; //sum(test_sequence) > max_Plen in vectors
+ uint32_t i;
+ uint32_t offset = 0, dist;
+
+ for (i = 0; i < sizeof(test_sequence); i++) {
+ dist = test_sequence[i];
+ if (offset + dist > len)
+ break;
+ aes_gcm_enc_128_update(key_data, context, out + offset, in + offset, dist);
+ offset += dist;
+ }
+
+ aes_gcm_enc_128_update(key_data, context, out + offset, in + offset, len - offset);
+ aes_gcm_enc_128_finalize(key_data, context, auth_tag, auth_tag_len);
+}
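+
+// The wrapper above drives aes_gcm_enc_128_update() with the chunk sizes in test_sequence and
+// then finalizes; the stream tests below compare the result against the same standard vectors
+// as the one-shot aes_gcm_enc_128() path tested earlier in this file, so chunked updates must
+// yield an identical ciphertext and tag.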
+
+void aes_gcm_stream_dec_128(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context,
+ uint8_t * out,
+ uint8_t const *in,
+ uint64_t len,
+ uint8_t * iv,
+ uint8_t const *aad,
+ uint64_t aad_len, uint8_t * auth_tag, uint64_t auth_tag_len)
+{
+ aes_gcm_init_128(key_data, context, iv, aad, aad_len);
+ uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; //sum(test_sequence) > max_Plen in vectors
+ uint32_t i;
+ uint32_t offset = 0, dist;
+
+ for (i = 0; i < sizeof(test_sequence); i++) {
+ dist = test_sequence[i];
+ if (offset + dist > len)
+ break;
+ aes_gcm_dec_128_update(key_data, context, out + offset, in + offset, dist);
+ offset += dist;
+ }
+ aes_gcm_dec_128_update(key_data, context, out + offset, in + offset, len - offset);
+ aes_gcm_dec_128_finalize(key_data, context, auth_tag, auth_tag_len);
+
+}
+
+#if !defined(NT_LD) && !defined(NT_ST) && !defined(NT_LDST)
+int test_gcm128_std_stream_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+ // Allocate space for the plain text
+ pt_test = malloc(vector->Plen);
+ if ((ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ memset(gkey.expanded_keys, 0, sizeof(gkey.expanded_keys));
+ aes_gcm_pre_128(vector->K, &gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+
+ aes_gcm_stream_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_stream_enc_128(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L encrypted tag T(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_stream_dec_128(&gkey, &gctx, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_stream_dec_128(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aes_gcm_stream_enc_128(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_stream_dec_128(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+ memset(pt_test, 0, vector->Plen);
+
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+void aes_gcm_stream_enc_256(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context,
+ uint8_t * out,
+ uint8_t const *in,
+ uint64_t len,
+ uint8_t * iv,
+ uint8_t const *aad,
+ uint64_t aad_len, uint8_t * auth_tag, uint64_t auth_tag_len)
+{
+ aes_gcm_init_256(key_data, context, iv, aad, aad_len);
+ uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; //sum(test_sequence) > max_Plen in vectors
+ uint32_t i;
+ uint32_t offset = 0, dist;
+
+ for (i = 0; i < sizeof(test_sequence); i++) {
+ dist = test_sequence[i];
+ if (offset + dist > len)
+ break;
+ aes_gcm_enc_256_update(key_data, context, out + offset, in + offset, dist);
+ offset += dist;
+ }
+
+ aes_gcm_enc_256_update(key_data, context, out + offset, in + offset, len - offset);
+ aes_gcm_enc_256_finalize(key_data, context, auth_tag, auth_tag_len);
+
+}
+
+void aes_gcm_stream_dec_256(const struct gcm_key_data *key_data,
+ struct gcm_context_data *context,
+ uint8_t * out,
+ uint8_t const *in,
+ uint64_t len,
+ uint8_t * iv,
+ uint8_t const *aad,
+ uint64_t aad_len, uint8_t * auth_tag, uint64_t auth_tag_len)
+{
+ aes_gcm_init_256(key_data, context, iv, aad, aad_len);
+ uint8_t test_sequence[] = { 1, 12, 22, 0, 1, 12, 16 }; //sum(test_sequence) > max_Plen in vectors
+ uint32_t i;
+ uint32_t offset = 0, dist;
+
+ for (i = 0; i < sizeof(test_sequence); i++) {
+ dist = test_sequence[i];
+ if (offset + dist > len)
+ break;
+ aes_gcm_dec_256_update(key_data, context, out + offset, in + offset, dist);
+ offset += dist;
+ }
+
+ aes_gcm_dec_256_update(key_data, context, out + offset, in + offset, len - offset);
+ aes_gcm_dec_256_finalize(key_data, context, auth_tag, auth_tag_len);
+
+}
+
+int test_gcm256_std_stream_vectors(gcm_vector const *vector)
+{
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint64_t IV_alloc_len = 0;
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+ // Allocate space for the plain text
+ pt_test = malloc(vector->Plen);
+ if ((ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen;
+ // Allocate space for the IV
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ memcpy(IV_c, vector->IV, vector->IVlen);
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aes_gcm_pre_256(vector->K, &gkey);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ aes_gcm_stream_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aes_gcm_stream_enc_256(&gkey, &gctx, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L encrypted tag T(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aes_gcm_stream_dec_256(&gkey, &gctx, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aes_gcm_stream_dec_256(&gkey, &gctx, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aes_gcm_stream_enc_256(&gkey, &gctx, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aes_gcm_stream_dec_256(&gkey, &gctx, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+#endif
+
+int test_gcm_std_vectors(void)
+{
+ int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]);
+ int vect;
+ int OK = 0;
+
+ printf("AES-GCM standard test vectors new api:\n");
+ for (vect = 0; (vect < vectors_cnt); vect++) {
+#ifdef DEBUG
+ printf("Standard vector new api %d/%d"
+ " Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen,
+ (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen,
+ (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen);
+#else
+ printf(".");
+#endif
+ if (BITS_128 == gcm_vectors[vect].Klen)
+ OK |= test_gcm128_std_vectors(&gcm_vectors[vect]);
+ else
+ OK |= test_gcm256_std_vectors(&gcm_vectors[vect]);
+ if (0 != OK)
+ return OK;
+ }
+ printf("\n");
+ return OK;
+}
+
+#if !defined(NT_LD) && !defined(NT_ST) && !defined(NT_LDST)
+/**
+ * Stream API test with standard vectors
+ */
+int test_gcm_std_strm_vectors(void)
+{
+ int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]);
+ int vect;
+ int OK = 0;
+
+ printf("AES-GCM standard test vectors stream api:\n");
+ for (vect = 0; (vect < vectors_cnt); vect++) {
+#ifdef DEBUG
+ printf("Standard vector stream api %d/%d"
+ " Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen,
+ (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen,
+ (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen);
+#else
+ printf(".");
+#endif
+ if (BITS_128 == gcm_vectors[vect].Klen)
+ OK |= test_gcm128_std_stream_vectors(&gcm_vectors[vect]);
+ else
+ OK |= test_gcm256_std_stream_vectors(&gcm_vectors[vect]);
+ if (0 != OK)
+ return OK;
+ }
+ printf("\n");
+ return OK;
+}
+#endif
+
+int main(int argc, char **argv)
+{
+ int errors = 0;
+ int seed;
+
+ if (argc == 1)
+ seed = TEST_SEED;
+ else
+ seed = atoi(argv[1]);
+
+ srand(seed);
+ printf("SEED: %d\n", seed);
+
+ errors += test_gcm_std_vectors();
+#if !defined(NT_LD) && !defined(NT_ST) && !defined(NT_LDST)
+ errors += test_gcm_std_strm_vectors();
+#endif
+
+ if (0 == errors)
+ printf("...Pass\n");
+ else
+ printf("...Fail\n");
+
+ return errors;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm
new file mode 100644
index 000000000..dac7c5912
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vaes_avx512.asm
@@ -0,0 +1,4296 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2018-2019, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+; Tomasz Kantecki
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in the paper:
+; Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010.
+; The details of the implementation are explained in:
+; Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; Must be a multiple of 4 bytes, as per the definition in the spec.
+; The code additionally supports any aadLen length.
+;
+; TLen:
+; From the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; Throughout the code, one-tab and two-tab indentations are used: one tab for the GHASH part, two tabs for the AES part.
+;
+
+%include "reg_sizes.asm"
+%include "clear_regs.asm"
+%include "gcm_keys_vaes_avx512.asm"
+%include "gcm_defines.asm"
+%include "memcpy.asm"
+%include "aes_common.asm"
+
+%ifndef GCM128_MODE
+%ifndef GCM192_MODE
+%ifndef GCM256_MODE
+%error "No GCM mode selected for gcm_avx512.asm!"
+%endif
+%endif
+%endif
+
+%ifndef FUNCT_EXTENSION
+%define FUNCT_EXTENSION
+%endif
+
+;; Decide on AES-GCM key size to compile for
+%ifdef GCM128_MODE
+%define NROUNDS 9
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _128 %+ y %+ vaes_avx512 %+ FUNCT_EXTENSION
+%endif
+
+%ifdef GCM192_MODE
+%define NROUNDS 11
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _192 %+ y %+ vaes_avx512 %+ FUNCT_EXTENSION
+%endif
+
+%ifdef GCM256_MODE
+%define NROUNDS 13
+%define FN_NAME(x,y) aes_gcm_ %+ x %+ _256 %+ y %+ vaes_avx512 %+ FUNCT_EXTENSION
+%endif
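+
+;; Note: AES-128/192/256 use 10/12/14 rounds in total; NROUNDS above is presumably the number
+;; of aesenc rounds performed before the final aesenclast, i.e. one less than the total.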
+
+%if (AS_FEATURE_LEVEL) >= 10
+
+section .text
+default rel
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Stack frame definition
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE (10*16) ; space for 10 XMM registers
+ %define GP_STORAGE ((9*8) + 24) ; space for 9 GP registers + 24 bytes for 64 byte alignment
+%else
+ %define XMM_STORAGE 0
+ %define GP_STORAGE (8*8) ; space for 7 GP registers + 1 for alignment
+%endif
+%ifdef GCM_BIG_DATA
+%define LOCAL_STORAGE (128*16) ; space for up to 128 AES blocks
+%else
+%define LOCAL_STORAGE (48*16) ; space for up to 48 AES blocks
+%endif
+
+;;; sequence is (bottom-up): GP, XMM, local
+%define STACK_GP_OFFSET 0
+%define STACK_XMM_OFFSET (STACK_GP_OFFSET + GP_STORAGE)
+%define STACK_LOCAL_OFFSET (STACK_XMM_OFFSET + XMM_STORAGE)
+%define STACK_FRAME_SIZE (STACK_LOCAL_OFFSET + LOCAL_STORAGE)
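+;; e.g. for a non-windows build without GCM_BIG_DATA: GP_STORAGE = 64, XMM_STORAGE = 0 and
+;; LOCAL_STORAGE = 48*16 = 768, so STACK_FRAME_SIZE = 832 bytes.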
+
+;; for compatibility with stack argument definitions in gcm_defines.asm
+%define STACK_OFFSET 0
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Horizontal XOR - 4 x 128bits xored together
+%macro VHPXORI4x128 2
+%define %%REG %1 ; [in/out] ZMM with 4x128bits to xor; 128bit output
+%define %%TMP %2 ; [clobbered] ZMM temporary register
+ vextracti64x4 YWORD(%%TMP), %%REG, 1
+ vpxorq YWORD(%%REG), YWORD(%%REG), YWORD(%%TMP)
+ vextracti32x4 XWORD(%%TMP), YWORD(%%REG), 1
+ vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP)
+%endmacro ; VHPXORI4x128
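+;;
+;; Example usage: "VHPXORI4x128 zmm1, zmm2" xors the four 128-bit lanes of zmm1 together and
+;; leaves the result in the low 128 bits (xmm1); zmm2 is clobbered.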
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Horizontal XOR - 2 x 128bits xored together
+%macro VHPXORI2x128 2
+%define %%REG %1 ; [in/out] YMM/ZMM with 2x128bits to xor; 128bit output
+%define %%TMP %2 ; [clobbered] XMM/YMM/ZMM temporary register
+ vextracti32x4 XWORD(%%TMP), %%REG, 1
+ vpxorq XWORD(%%REG), XWORD(%%REG), XWORD(%%TMP)
+%endmacro ; VHPXORI2x128
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply - 1st step
+%macro VCLMUL_STEP1 6-7
+%define %%KP %1 ; [in] key pointer
+%define %%HI %2 ; [in] previous blocks 4 to 7
+%define %%TMP %3 ; [clobbered] ZMM/YMM/XMM temporary
+%define %%TH %4 ; [out] high product
+%define %%TM %5 ; [out] medium product
+%define %%TL %6 ; [out] low product
+%define %%HKEY %7 ; [in/optional] hash key for multiplication
+
+%if %0 == 6
+ vmovdqu64 %%TMP, [%%KP + HashKey_4]
+%else
+ vmovdqa64 %%TMP, %%HKEY
+%endif
+ vpclmulqdq %%TH, %%HI, %%TMP, 0x11 ; %%T5 = a1*b1
+ vpclmulqdq %%TL, %%HI, %%TMP, 0x00 ; %%T7 = a0*b0
+ vpclmulqdq %%TM, %%HI, %%TMP, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%TMP, %%HI, %%TMP, 0x10 ; %%T4 = a0*b1
+ vpxorq %%TM, %%TM, %%TMP ; [%%TH : %%TM : %%TL]
+%endmacro ; VCLMUL_STEP1
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply - 2nd step
+%macro VCLMUL_STEP2 9-11
+%define %%KP %1 ; [in] key pointer
+%define %%HI %2 ; [out] ghash high 128 bits
+%define %%LO %3 ; [in/out] cipher text blocks 0-3 (in); ghash low 128 bits (out)
+%define %%TMP0 %4 ; [clobbered] ZMM/YMM/XMM temporary
+%define %%TMP1 %5 ; [clobbered] ZMM/YMM/XMM temporary
+%define %%TMP2 %6 ; [clobbered] ZMM/YMM/XMM temporary
+%define %%TH %7 ; [in] high product
+%define %%TM %8 ; [in] medium product
+%define %%TL %9 ; [in] low product
+%define %%HKEY %10 ; [in/optional] hash key for multiplication
+%define %%HXOR %11 ; [in/optional] type of horizontal xor (4 - 4x128; 2 - 2x128; 1 - none)
+
+%if %0 == 9
+ vmovdqu64 %%TMP0, [%%KP + HashKey_8]
+%else
+ vmovdqa64 %%TMP0, %%HKEY
+%endif
+ vpclmulqdq %%TMP1, %%LO, %%TMP0, 0x10 ; %%TMP1 = a0*b1
+ vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x11 ; %%TMP2 = a1*b1
+ vpxorq %%TH, %%TH, %%TMP2
+ vpclmulqdq %%TMP2, %%LO, %%TMP0, 0x00 ; %%TMP2 = a0*b0
+ vpxorq %%TL, %%TL, %%TMP2
+ vpclmulqdq %%TMP0, %%LO, %%TMP0, 0x01 ; %%TMP0 = a1*b0
+ vpternlogq %%TM, %%TMP1, %%TMP0, 0x96 ; %%TM = TM xor TMP1 xor TMP0
+
+ ;; finish multiplications
+ vpsrldq %%TMP2, %%TM, 8
+ vpxorq %%HI, %%TH, %%TMP2
+ vpslldq %%TMP2, %%TM, 8
+ vpxorq %%LO, %%TL, %%TMP2
+
+ ;; xor 128bit words horizontally and compute [(X8*H1) + (X7*H2) + ... + ((X1+Y0)*H8)]
+ ;; note: (X1+Y0) handled elsewhere
+%if %0 < 11
+ VHPXORI4x128 %%HI, %%TMP2
+ VHPXORI4x128 %%LO, %%TMP1
+%else
+%if %%HXOR == 4
+ VHPXORI4x128 %%HI, %%TMP2
+ VHPXORI4x128 %%LO, %%TMP1
+%elif %%HXOR == 2
+ VHPXORI2x128 %%HI, %%TMP2
+ VHPXORI2x128 %%LO, %%TMP1
+%endif ; HXOR
+ ;; for HXOR == 1 there is nothing to be done
+%endif ; !(%0 < 11)
+ ;; HIx holds top 128 bits
+ ;; LOx holds low 128 bits
+ ;; - further reductions to follow
+%endmacro ; VCLMUL_STEP2
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; AVX512 reduction macro
+%macro VCLMUL_REDUCE 6
+%define %%OUT %1 ; [out] zmm/ymm/xmm: result (must not be %%TMP1 or %%HI128)
+%define %%POLY %2 ; [in] zmm/ymm/xmm: polynomial
+%define %%HI128 %3 ; [in] zmm/ymm/xmm: high 128b of hash to reduce
+%define %%LO128 %4 ; [in] zmm/ymm/xmm: low 128b of hash to reduce
+%define %%TMP0 %5 ; [in] zmm/ymm/xmm: temporary register
+%define %%TMP1 %6 ; [in] zmm/ymm/xmm: temporary register
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; first phase of the reduction
+ vpclmulqdq %%TMP0, %%POLY, %%LO128, 0x01
+ vpslldq %%TMP0, %%TMP0, 8 ; shift-L 2 DWs
+ vpxorq %%TMP0, %%LO128, %%TMP0 ; first phase of the reduction complete
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; second phase of the reduction
+ vpclmulqdq %%TMP1, %%POLY, %%TMP0, 0x00
+ vpsrldq %%TMP1, %%TMP1, 4 ; shift-R only 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq %%OUT, %%POLY, %%TMP0, 0x10
+ vpslldq %%OUT, %%OUT, 4 ; shift-L 1-DW to obtain result with no shifts
+
+ vpternlogq %%OUT, %%TMP1, %%HI128, 0x96 ; OUT/GHASH = OUT xor TMP1 xor HI128
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endmacro
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply (1 to 8 blocks) - 1st step
+%macro VCLMUL_1_TO_8_STEP1 8
+%define %%KP %1 ; [in] key pointer
+%define %%HI %2 ; [in] ZMM ciphered blocks 4 to 7
+%define %%TMP1 %3 ; [clobbered] ZMM temporary
+%define %%TMP2 %4 ; [clobbered] ZMM temporary
+%define %%TH %5 ; [out] ZMM high product
+%define %%TM %6 ; [out] ZMM medium product
+%define %%TL %7 ; [out] ZMM low product
+%define %%NBLOCKS %8 ; [in] number of blocks to ghash (0 to 8)
+
+%if %%NBLOCKS == 8
+ VCLMUL_STEP1 %%KP, %%HI, %%TMP1, %%TH, %%TM, %%TL
+%elif %%NBLOCKS == 7
+ vmovdqu64 %%TMP2, [%%KP + HashKey_3]
+ vmovdqa64 %%TMP1, [rel mask_out_top_block]
+ vpandq %%TMP2, %%TMP1
+ vpandq %%HI, %%TMP1
+ VCLMUL_STEP1 NULL, %%HI, %%TMP1, %%TH, %%TM, %%TL, %%TMP2
+%elif %%NBLOCKS == 6
+ vmovdqu64 YWORD(%%TMP2), [%%KP + HashKey_2]
+ VCLMUL_STEP1 NULL, YWORD(%%HI), YWORD(%%TMP1), \
+ YWORD(%%TH), YWORD(%%TM), YWORD(%%TL), YWORD(%%TMP2)
+%elif %%NBLOCKS == 5
+ vmovdqu64 XWORD(%%TMP2), [%%KP + HashKey_1]
+ VCLMUL_STEP1 NULL, XWORD(%%HI), XWORD(%%TMP1), \
+ XWORD(%%TH), XWORD(%%TM), XWORD(%%TL), XWORD(%%TMP2)
+%else
+ vpxorq %%TH, %%TH
+ vpxorq %%TM, %%TM
+ vpxorq %%TL, %%TL
+%endif
+%endmacro ; VCLMUL_1_TO_8_STEP1
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; schoolbook multiply (1 to 8 blocks) - 2nd step
+%macro VCLMUL_1_TO_8_STEP2 10
+%define %%KP %1 ; [in] key pointer
+%define %%HI %2 ; [out] ZMM ghash high 128bits
+%define %%LO %3 ; [in/out] ZMM ciphered blocks 0 to 3 (in); ghash low 128bits (out)
+%define %%TMP0 %4 ; [clobbered] ZMM temporary
+%define %%TMP1 %5 ; [clobbered] ZMM temporary
+%define %%TMP2 %6 ; [clobbered] ZMM temporary
+%define %%TH %7 ; [in/clobbered] ZMM high sum
+%define %%TM %8 ; [in/clobbered] ZMM medium sum
+%define %%TL %9 ; [in/clobbered] ZMM low sum
+%define %%NBLOCKS %10 ; [in] number of blocks to ghash (0 to 8)
+
+%if %%NBLOCKS == 8
+ VCLMUL_STEP2 %%KP, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL
+%elif %%NBLOCKS == 7
+ vmovdqu64 %%TMP2, [%%KP + HashKey_7]
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 6
+ vmovdqu64 %%TMP2, [%%KP + HashKey_6]
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 5
+ vmovdqu64 %%TMP2, [%%KP + HashKey_5]
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 4
+ vmovdqu64 %%TMP2, [%%KP + HashKey_4]
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 3
+ vmovdqu64 %%TMP2, [%%KP + HashKey_3]
+ vmovdqa64 %%TMP1, [rel mask_out_top_block]
+ vpandq %%TMP2, %%TMP1
+ vpandq %%LO, %%TMP1
+ VCLMUL_STEP2 NULL, %%HI, %%LO, %%TMP0, %%TMP1, %%TMP2, %%TH, %%TM, %%TL, %%TMP2, 4
+%elif %%NBLOCKS == 2
+ vmovdqu64 YWORD(%%TMP2), [%%KP + HashKey_2]
+ VCLMUL_STEP2 NULL, YWORD(%%HI), YWORD(%%LO), \
+ YWORD(%%TMP0), YWORD(%%TMP1), YWORD(%%TMP2), \
+ YWORD(%%TH), YWORD(%%TM), YWORD(%%TL), YWORD(%%TMP2), 2
+%elif %%NBLOCKS == 1
+ vmovdqu64 XWORD(%%TMP2), [%%KP + HashKey_1]
+ VCLMUL_STEP2 NULL, XWORD(%%HI), XWORD(%%LO), \
+ XWORD(%%TMP0), XWORD(%%TMP1), XWORD(%%TMP2), \
+ XWORD(%%TH), XWORD(%%TM), XWORD(%%TL), XWORD(%%TMP2), 1
+%else
+ vpxorq %%HI, %%HI
+ vpxorq %%LO, %%LO
+%endif
+%endmacro ; VCLMUL_1_TO_8_STEP2
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; GHASH 1 to 16 blocks of cipher text
+;;; - performs reduction at the end
+;;; - can take intermediate GHASH sums as input
+%macro GHASH_1_TO_16 20
+%define %%KP %1 ; [in] pointer to expanded keys
+%define %%GHASH %2 ; [out] ghash output
+%define %%T1 %3 ; [clobbered] temporary ZMM
+%define %%T2 %4 ; [clobbered] temporary ZMM
+%define %%T3 %5 ; [clobbered] temporary ZMM
+%define %%T4 %6 ; [clobbered] temporary ZMM
+%define %%T5 %7 ; [clobbered] temporary ZMM
+%define %%T6 %8 ; [clobbered] temporary ZMM
+%define %%T7 %9 ; [clobbered] temporary ZMM
+%define %%T8 %10 ; [clobbered] temporary ZMM
+%define %%T9 %11 ; [clobbered] temporary ZMM
+%define %%GH %12 ; [in/clobbered] ghash sum (high) or "no_zmm"
+%define %%GL %13 ; [in/clobbered] ghash sum (low) or "no_zmm"
+%define %%GM %14 ; [in/clobbered] ghash sum (medium) or "no_zmm"
+%define %%AAD_HASH_IN %15 ; [in] input hash value
+%define %%CIPHER_IN0 %16 ; [in] ZMM with cipher text blocks 0-3
+%define %%CIPHER_IN1 %17 ; [in] ZMM with cipher text blocks 4-7
+%define %%CIPHER_IN2 %18 ; [in] ZMM with cipher text blocks 8-11
+%define %%CIPHER_IN3 %19 ; [in] ZMM with cipher text blocks 12-15
+%define %%NUM_BLOCKS %20 ; [in] numerical value, number of blocks
+
+%define %%T0H %%T1
+%define %%T0L %%T2
+%define %%T0M1 %%T3
+%define %%T0M2 %%T4
+
+%define %%T1H %%T5
+%define %%T1L %%T6
+%define %%T1M1 %%T7
+%define %%T1M2 %%T8
+
+%define %%HK %%T9
+
+%assign hashk HashKey_ %+ %%NUM_BLOCKS
+%assign reg_idx 0
+%assign blocks_left %%NUM_BLOCKS
+
+ vpxorq %%CIPHER_IN0, %%CIPHER_IN0, %%AAD_HASH_IN
+
+%assign first_result 1
+
+%ifnidn %%GH, no_zmm
+%ifnidn %%GM, no_zmm
+%ifnidn %%GL, no_zmm
+ ;; GHASH sums passed in to be updated and
+ ;; reduced at the end
+ vmovdqa64 %%T0H, %%GH
+ vmovdqa64 %%T0L, %%GL
+ vmovdqa64 %%T0M1, %%GM
+ vpxorq %%T0M2, %%T0M2
+%assign first_result 0
+%endif
+%endif
+%endif
+
+%rep (blocks_left / 4)
+%xdefine %%REG_IN %%CIPHER_IN %+ reg_idx
+ vmovdqu64 %%HK, [%%KP + hashk]
+%if first_result == 1
+ vpclmulqdq %%T0H, %%REG_IN, %%HK, 0x11 ; H = a1*b1
+ vpclmulqdq %%T0L, %%REG_IN, %%HK, 0x00 ; L = a0*b0
+ vpclmulqdq %%T0M1, %%REG_IN, %%HK, 0x01 ; M1 = a1*b0
+ vpclmulqdq %%T0M2, %%REG_IN, %%HK, 0x10 ; TM2 = a0*b1
+%assign first_result 0
+%else
+ vpclmulqdq %%T1H, %%REG_IN, %%HK, 0x11 ; H = a1*b1
+ vpclmulqdq %%T1L, %%REG_IN, %%HK, 0x00 ; L = a0*b0
+ vpclmulqdq %%T1M1, %%REG_IN, %%HK, 0x01 ; M1 = a1*b0
+ vpclmulqdq %%T1M2, %%REG_IN, %%HK, 0x10 ; M2 = a0*b1
+ vpxorq %%T0H, %%T0H, %%T1H
+ vpxorq %%T0L, %%T0L, %%T1L
+ vpxorq %%T0M1, %%T0M1, %%T1M1
+ vpxorq %%T0M2, %%T0M2, %%T1M2
+%endif
+%undef %%REG_IN
+%assign reg_idx (reg_idx + 1)
+%assign hashk (hashk + 64)
+%assign blocks_left (blocks_left - 4)
+%endrep
+
+%if blocks_left > 0
+;; There are 1, 2 or 3 blocks left to process.
+;; It may also be that they are the only blocks to process.
+
+%xdefine %%REG_IN %%CIPHER_IN %+ reg_idx
+
+%if first_result == 1
+;; Case where %%NUM_BLOCKS = 1, 2 or 3
+%xdefine %%OUT_H %%T0H
+%xdefine %%OUT_L %%T0L
+%xdefine %%OUT_M1 %%T0M1
+%xdefine %%OUT_M2 %%T0M2
+%else
+%xdefine %%OUT_H %%T1H
+%xdefine %%OUT_L %%T1L
+%xdefine %%OUT_M1 %%T1M1
+%xdefine %%OUT_M2 %%T1M2
+%endif
+
+%if blocks_left == 1
+ vmovdqu64 XWORD(%%HK), [%%KP + hashk]
+ vpclmulqdq XWORD(%%OUT_H), XWORD(%%REG_IN), XWORD(%%HK), 0x11 ; %%TH = a1*b1
+ vpclmulqdq XWORD(%%OUT_L), XWORD(%%REG_IN), XWORD(%%HK), 0x00 ; %%TL = a0*b0
+ vpclmulqdq XWORD(%%OUT_M1), XWORD(%%REG_IN), XWORD(%%HK), 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq XWORD(%%OUT_M2), XWORD(%%REG_IN), XWORD(%%HK), 0x10 ; %%TM2 = a0*b1
+%elif blocks_left == 2
+ vmovdqu64 YWORD(%%HK), [%%KP + hashk]
+ vpclmulqdq YWORD(%%OUT_H), YWORD(%%REG_IN), YWORD(%%HK), 0x11 ; %%TH = a1*b1
+ vpclmulqdq YWORD(%%OUT_L), YWORD(%%REG_IN), YWORD(%%HK), 0x00 ; %%TL = a0*b0
+ vpclmulqdq YWORD(%%OUT_M1), YWORD(%%REG_IN), YWORD(%%HK), 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq YWORD(%%OUT_M2), YWORD(%%REG_IN), YWORD(%%HK), 0x10 ; %%TM2 = a0*b1
+%else ; blocks_left == 3
+ vmovdqu64 YWORD(%%HK), [%%KP + hashk]
+ vinserti64x2 %%HK, [%%KP + hashk + 32], 2
+ vpclmulqdq %%OUT_H, %%REG_IN, %%HK, 0x11 ; %%TH = a1*b1
+ vpclmulqdq %%OUT_L, %%REG_IN, %%HK, 0x00 ; %%TL = a0*b0
+ vpclmulqdq %%OUT_M1, %%REG_IN, %%HK, 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq %%OUT_M2, %%REG_IN, %%HK, 0x10 ; %%TM2 = a0*b1
+%endif ; blocks_left
+
+%undef %%REG_IN
+%undef %%OUT_H
+%undef %%OUT_L
+%undef %%OUT_M1
+%undef %%OUT_M2
+
+%if first_result != 1
+ vpxorq %%T0H, %%T0H, %%T1H
+ vpxorq %%T0L, %%T0L, %%T1L
+ vpxorq %%T0M1, %%T0M1, %%T1M1
+ vpxorq %%T0M2, %%T0M2, %%T1M2
+%endif
+
+%endif ; blocks_left > 0
+
+ ;; integrate TM into TH and TL
+ vpxorq %%T0M1, %%T0M1, %%T0M2
+ vpsrldq %%T1M1, %%T0M1, 8
+ vpslldq %%T1M2, %%T0M1, 8
+ vpxorq %%T0H, %%T0H, %%T1M1
+ vpxorq %%T0L, %%T0L, %%T1M2
+
+ ;; add TH and TL 128-bit words horizontally
+ VHPXORI4x128 %%T0H, %%T1M1
+ VHPXORI4x128 %%T0L, %%T1M2
+
+ ;; reduction
+ vmovdqa64 XWORD(%%HK), [rel POLY2]
+ VCLMUL_REDUCE XWORD(%%GHASH), XWORD(%%HK), \
+ XWORD(%%T0H), XWORD(%%T0L), XWORD(%%T0M1), XWORD(%%T0M2)
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+;;; Input: A and B (128-bits each, bit-reflected)
+;;; Output: C = A*B*x mod poly, (i.e. >>1 )
+;;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+;;; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1
+ vpxorq %%GH, %%GH, %%T3
+
+
+ vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs
+ vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs
+
+ vpxorq %%T1, %%T1, %%T3
+ vpxorq %%GH, %%GH, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu64 %%T3, [rel POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%GH, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxorq %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%GH, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R only 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq %%GH, %%T3, %%GH, 0x10
+ vpslldq %%GH, %%GH, 4 ; Shift-L 1-DW to obtain result with no shifts
+
+ ; second phase of the reduction complete, the result is in %%GH
+ vpternlogq %%GH, %%T1, %%T2, 0x96 ; GH = GH xor T1 xor T2
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx512
+;;; functions, but are kept to allow users to switch cpu architectures between calls
+;;; of pre, init, update, and finalize.
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+ vmovdqa %%T5, %%HK
+
+ ;; GHASH keys 2 to 48 or 128
+%ifdef GCM_BIG_DATA
+%assign max_hkey_idx 128
+%else
+%assign max_hkey_idx 48
+%endif
+
+%assign i 2
+%rep (max_hkey_idx - 1)
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^i<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_ %+ i], %%T5 ; [HashKey_i] = %%T5
+%assign i (i + 1)
+%endrep
+
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; READ_SMALL_DATA_INPUT
+;;; Packs an xmm register with data when the data input is less than or equal to 16 bytes
+;;; The output register is zeroed when the data has length 0
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 5
+%define %%OUTPUT %1 ; [out] xmm register
+%define %%INPUT %2 ; [in] buffer pointer to read from
+%define %%LENGTH %3 ; [in] number of bytes to read
+%define %%TMP1 %4 ; [clobbered]
+%define %%MASK %5 ; [out] k1 to k7 register to store the partial block mask
+
+ cmp %%LENGTH, 16
+ jge %%_read_small_data_ge16
+ lea %%TMP1, [rel byte_len_to_mask_table]
+%ifidn __OUTPUT_FORMAT__, win64
+ add %%TMP1, %%LENGTH
+ add %%TMP1, %%LENGTH
+ kmovw %%MASK, [%%TMP1]
+%else
+ kmovw %%MASK, [%%TMP1 + %%LENGTH*2]
+%endif
+ vmovdqu8 %%OUTPUT{%%MASK}{z}, [%%INPUT]
+ jmp %%_read_small_data_end
+%%_read_small_data_ge16:
+ VX512LDR %%OUTPUT, [%%INPUT]
+ mov %%TMP1, 0xffff
+ kmovq %%MASK, %%TMP1
+%%_read_small_data_end:
+%endmacro ; READ_SMALL_DATA_INPUT
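+
+;;; byte_len_to_mask_table is indexed with a 2-byte stride; entry LENGTH is
+;;; assumed to hold a 16-bit mask with the LENGTH least significant bits set,
+;;; so the masked vmovdqu8 above loads exactly LENGTH bytes and zeroes the
+;;; rest of the xmm register. Illustrative call (arbitrary register choices),
+;;; reading r12 bytes from the buffer at rsi into xmm0 with r13 as scratch:
+;;;     READ_SMALL_DATA_INPUT   xmm0, rsi, r12, r13, k1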
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 18
+%define %%A_IN %1 ; [in] AAD text pointer
+%define %%A_LEN %2 ; [in] AAD length
+%define %%AAD_HASH %3 ; [out] xmm ghash value
+%define %%GDATA_KEY %4 ; [in] pointer to keys
+%define %%ZT0 %5 ; [clobbered] ZMM register
+%define %%ZT1 %6 ; [clobbered] ZMM register
+%define %%ZT2 %7 ; [clobbered] ZMM register
+%define %%ZT3 %8 ; [clobbered] ZMM register
+%define %%ZT4 %9 ; [clobbered] ZMM register
+%define %%ZT5 %10 ; [clobbered] ZMM register
+%define %%ZT6 %11 ; [clobbered] ZMM register
+%define %%ZT7 %12 ; [clobbered] ZMM register
+%define %%ZT8 %13 ; [clobbered] ZMM register
+%define %%ZT9 %14 ; [clobbered] ZMM register
+%define %%T1 %15 ; [clobbered] GP register
+%define %%T2 %16 ; [clobbered] GP register
+%define %%T3 %17 ; [clobbered] GP register
+%define %%MASKREG %18 ; [clobbered] mask register
+
+%define %%SHFMSK %%ZT9
+%define %%POLY %%ZT8
+%define %%TH %%ZT7
+%define %%TM %%ZT6
+%define %%TL %%ZT5
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxorq %%AAD_HASH, %%AAD_HASH
+
+ vmovdqa64 %%SHFMSK, [rel SHUF_MASK]
+ vmovdqa64 %%POLY, [rel POLY2]
+
+%%_get_AAD_loop128:
+ cmp %%T2, 128
+ jl %%_exit_AAD_loop128
+
+ vmovdqu64 %%ZT2, [%%T1 + 64*0] ; LO blocks (0-3)
+ vmovdqu64 %%ZT1, [%%T1 + 64*1] ; HI blocks (4-7)
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb %%ZT1, %%SHFMSK
+
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+
+ VCLMUL_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%TH, %%TM, %%TL
+ VCLMUL_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, %%ZT0, %%ZT3, %%ZT4, %%TH, %%TM, %%TL
+
+ ;; result in %%ZT1(H):%%ZT2(L)
+ ;; reduce and put the result in AAD_HASH
+ VCLMUL_REDUCE %%AAD_HASH, XWORD(%%POLY), XWORD(%%ZT1), XWORD(%%ZT2), \
+ XWORD(%%ZT0), XWORD(%%ZT3)
+
+ sub %%T2, 128
+ je %%_CALC_AAD_done
+
+ add %%T1, 128
+ jmp %%_get_AAD_loop128
+
+%%_exit_AAD_loop128:
+ or %%T2, %%T2
+ jz %%_CALC_AAD_done
+
+ ;; prep mask source address
+ lea %%T3, [rel byte64_len_to_mask_table]
+ lea %%T3, [%%T3 + %%T2*8]
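+        ;; byte64_len_to_mask_table entries are 8 bytes wide; entry N is
+        ;; assumed to hold a 64-bit byte mask with the N least significant
+        ;; bits set. For the 5 to 8 block cases below, "sub %%T3, (64 * 8)"
+        ;; re-biases the pointer so the mask covers only the bytes of the
+        ;; second 64-byte load.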
+
+ ;; calculate number of blocks to ghash (including partial bytes)
+ add %%T2, 15
+ and %%T2, -16 ; 1 to 8 blocks possible here
+ shr %%T2, 4
+ cmp %%T2, 7
+ je %%_AAD_blocks_7
+ cmp %%T2, 6
+ je %%_AAD_blocks_6
+ cmp %%T2, 5
+ je %%_AAD_blocks_5
+ cmp %%T2, 4
+ je %%_AAD_blocks_4
+ cmp %%T2, 3
+ je %%_AAD_blocks_3
+ cmp %%T2, 2
+ je %%_AAD_blocks_2
+ cmp %%T2, 1
+ je %%_AAD_blocks_1
+ ;; fall through for 8 blocks
+
+        ;; The flow of each of these cases is identical:
+        ;; - load the blocks of text
+        ;; - shuffle the loaded blocks
+        ;; - xor the current hash value into block 0
+        ;; - perform the multiplications with the ghash keys
+        ;; - jump to the reduction code
+%%_AAD_blocks_8:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 %%ZT1{%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb %%ZT1, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 8
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 8
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_7:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 %%ZT1{%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb %%ZT1, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH) ; xor in current ghash
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 7
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 7
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_6:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 YWORD(%%ZT1){%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb YWORD(%%ZT1), YWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 6
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 6
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_5:
+ sub %%T3, (64 * 8)
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2, [%%T1 + 64*0]
+ vmovdqu8 XWORD(%%ZT1){%%MASKREG}{z}, [%%T1 + 64*1]
+ vpshufb %%ZT2, %%SHFMSK
+ vpshufb XWORD(%%ZT1), XWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 5
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 5
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_4:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2{%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb %%ZT2, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 4
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 4
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_3:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 %%ZT2{%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb %%ZT2, %%SHFMSK
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 3
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 3
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_2:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 YWORD(%%ZT2){%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb YWORD(%%ZT2), YWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 2
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 2
+ jmp %%_AAD_blocks_done
+
+%%_AAD_blocks_1:
+ kmovq %%MASKREG, [%%T3]
+ vmovdqu8 XWORD(%%ZT2){%%MASKREG}{z}, [%%T1 + 64*0]
+ vpshufb XWORD(%%ZT2), XWORD(%%SHFMSK)
+ vpxorq %%ZT2, %%ZT2, ZWORD(%%AAD_HASH)
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT1, %%ZT0, %%ZT3, %%TH, %%TM, %%TL, 1
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT1, %%ZT2, \
+ %%ZT0, %%ZT3, %%ZT4, \
+ %%TH, %%TM, %%TL, 1
+
+%%_AAD_blocks_done:
+ ;; Multiplications have been done. Do the reduction now
+ VCLMUL_REDUCE %%AAD_HASH, XWORD(%%POLY), XWORD(%%ZT1), XWORD(%%ZT2), \
+ XWORD(%%ZT0), XWORD(%%ZT3)
+%%_CALC_AAD_done:
+ ;; result in AAD_HASH
+
+%endmacro ; CALC_AAD_HASH
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; PARTIAL_BLOCK
+;;; Handles encryption/decryption and the tag partial blocks between
+;;; update calls.
+;;; Requires the input data to be at least 1 byte long.
+;;; Output:
+;;; The cipher/plain text of the first partial block (CYPH_PLAIN_OUT),
+;;; the updated AAD_HASH and the updated GDATA_CTX
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 22
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer
+%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length
+%define %%DATA_OFFSET %6 ; [in/out] data offset (gets updated)
+%define %%AAD_HASH %7 ; [out] updated GHASH value
+%define %%ENC_DEC %8 ; [in] cipher direction
+%define %%GPTMP0 %9 ; [clobbered] GP temporary register
+%define %%GPTMP1 %10 ; [clobbered] GP temporary register
+%define %%GPTMP2 %11 ; [clobbered] GP temporary register
+%define %%ZTMP0 %12 ; [clobbered] ZMM temporary register
+%define %%ZTMP1 %13 ; [clobbered] ZMM temporary register
+%define %%ZTMP2 %14 ; [clobbered] ZMM temporary register
+%define %%ZTMP3 %15 ; [clobbered] ZMM temporary register
+%define %%ZTMP4 %16 ; [clobbered] ZMM temporary register
+%define %%ZTMP5 %17 ; [clobbered] ZMM temporary register
+%define %%ZTMP6 %18 ; [clobbered] ZMM temporary register
+%define %%ZTMP7 %19 ; [clobbered] ZMM temporary register
+%define %%ZTMP8 %20 ; [clobbered] ZMM temporary register
+%define %%ZTMP9 %21 ; [clobbered] ZMM temporary register
+%define %%MASKREG %22 ; [clobbered] mask temporary register
+
+%define %%XTMP0 XWORD(%%ZTMP0)
+%define %%XTMP1 XWORD(%%ZTMP1)
+%define %%XTMP2 XWORD(%%ZTMP2)
+%define %%XTMP3 XWORD(%%ZTMP3)
+%define %%XTMP4 XWORD(%%ZTMP4)
+%define %%XTMP5 XWORD(%%ZTMP5)
+%define %%XTMP6 XWORD(%%ZTMP6)
+%define %%XTMP7 XWORD(%%ZTMP7)
+%define %%XTMP8 XWORD(%%ZTMP8)
+%define %%XTMP9 XWORD(%%ZTMP9)
+
+%define %%LENGTH %%GPTMP0
+%define %%IA0 %%GPTMP1
+%define %%IA1 %%GPTMP2
+
+ mov %%LENGTH, [%%GDATA_CTX + PBlockLen]
+ or %%LENGTH, %%LENGTH
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ READ_SMALL_DATA_INPUT %%XTMP0, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%IA0, %%MASKREG
+
+ ;; XTMP1 = my_ctx_data.partial_block_enc_key
+ vmovdqu64 %%XTMP1, [%%GDATA_CTX + PBlockEncKey]
+ vmovdqu64 %%XTMP2, [%%GDATA_KEY + HashKey]
+
+ ;; adjust the shuffle mask pointer to be able to shift right %%LENGTH bytes
+        ;; ((16 - %%LENGTH) is the number of bytes in plaintext mod 16)
+ lea %%IA0, [rel SHIFT_MASK]
+ add %%IA0, %%LENGTH
+ vmovdqu64 %%XTMP3, [%%IA0] ; shift right shuffle mask
+ vpshufb %%XTMP1, %%XTMP3
+
+%ifidn %%ENC_DEC, DEC
+ ;; keep copy of cipher text in %%XTMP4
+ vmovdqa64 %%XTMP4, %%XTMP0
+%endif
+ vpxorq %%XTMP1, %%XTMP0 ; Cyphertext XOR E(K, Yn)
+
+ ;; Set %%IA1 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ ;; Determine if partial block is not being filled and shift mask accordingly
+ mov %%IA1, %%PLAIN_CYPH_LEN
+ add %%IA1, %%LENGTH
+ sub %%IA1, 16
+ jge %%_no_extra_mask
+ sub %%IA0, %%IA1
+%%_no_extra_mask:
+ ;; get the appropriate mask to mask out bottom %%LENGTH bytes of %%XTMP1
+ ;; - mask out bottom %%LENGTH bytes of %%XTMP1
+ vmovdqu64 %%XTMP0, [%%IA0 + ALL_F - SHIFT_MASK]
+ vpand %%XTMP1, %%XTMP0
+
+%ifidn %%ENC_DEC, DEC
+ vpand %%XTMP4, %%XTMP0
+ vpshufb %%XTMP4, [rel SHUF_MASK]
+ vpshufb %%XTMP4, %%XTMP3
+ vpxorq %%AAD_HASH, %%XTMP4
+%else
+ vpshufb %%XTMP1, [rel SHUF_MASK]
+ vpshufb %%XTMP1, %%XTMP3
+ vpxorq %%AAD_HASH, %%XTMP1
+%endif
+ cmp %%IA1, 0
+ jl %%_partial_incomplete
+
+ ;; GHASH computation for the last <16 Byte block
+ GHASH_MUL %%AAD_HASH, %%XTMP2, %%XTMP5, %%XTMP6, %%XTMP7, %%XTMP8, %%XTMP9
+
+ mov qword [%%GDATA_CTX + PBlockLen], 0
+
+        ;; Set %%LENGTH to be the number of bytes to write out
+ mov %%IA0, %%LENGTH
+ mov %%LENGTH, 16
+ sub %%LENGTH, %%IA0
+ jmp %%_enc_dec_done
+
+%%_partial_incomplete:
+%ifidn __OUTPUT_FORMAT__, win64
+ mov %%IA0, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + PBlockLen], %%IA0
+%else
+ add [%%GDATA_CTX + PBlockLen], %%PLAIN_CYPH_LEN
+%endif
+ mov %%LENGTH, %%PLAIN_CYPH_LEN
+
+%%_enc_dec_done:
+ ;; output encrypted Bytes
+
+ lea %%IA0, [rel byte_len_to_mask_table]
+ kmovw %%MASKREG, [%%IA0 + %%LENGTH*2]
+ vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASH
+
+%ifidn %%ENC_DEC, ENC
+ ;; shuffle XTMP1 back to output as ciphertext
+ vpshufb %%XTMP1, [rel SHUF_MASK]
+ vpshufb %%XTMP1, %%XTMP3
+%endif
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET]{%%MASKREG}, %%XTMP1
+ add %%DATA_OFFSET, %%LENGTH
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
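+
+;;; Cross-call state used by PARTIAL_BLOCK: [GDATA_CTX + PBlockLen] holds the
+;;; number of bytes already gathered in the pending partial block and
+;;; [GDATA_CTX + PBlockEncKey] holds the encrypted counter block for it.
+;;; When the new data completes the 16-byte block, the block is GHASH'ed and
+;;; PBlockLen is cleared; otherwise only the accumulated length is updated.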
+
+
+%macro GHASH_SINGLE_MUL 9
+%define %%GDATA %1
+%define %%HASHKEY %2
+%define %%CIPHER %3
+%define %%STATE_11 %4
+%define %%STATE_00 %5
+%define %%STATE_MID %6
+%define %%T1 %7
+%define %%T2 %8
+%define %%FIRST %9
+
+ vmovdqu %%T1, [%%GDATA + %%HASHKEY]
+%ifidn %%FIRST, first
+ vpclmulqdq %%STATE_11, %%CIPHER, %%T1, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%STATE_00, %%CIPHER, %%T1, 0x00 ; %%T4_2 = a0*b0
+ vpclmulqdq %%STATE_MID, %%CIPHER, %%T1, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10 ; %%T5 = a0*b1
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+%else
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x11
+ vpxor %%STATE_11, %%STATE_11, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x00
+ vpxor %%STATE_00, %%STATE_00, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x01
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+
+ vpclmulqdq %%T2, %%CIPHER, %%T1, 0x10
+ vpxor %%STATE_MID, %%STATE_MID, %%T2
+%endif
+
+%endmacro
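+
+;;; GHASH_SINGLE_MUL accumulates the four 64x64-bit partial products of the
+;;; cipher block argument and the selected hash key power: a1*b1 into
+;;; STATE_11, a0*b0 into STATE_00 and the two cross products into STATE_MID.
+;;; With the "first" selector the accumulators are initialized, otherwise the
+;;; products are XOR'ed into them; folding and reduction are left to the caller.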
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; This macro is used to "warm up" the pipeline for the GHASH_8_ENCRYPT_8_PARALLEL
+;;; macro code. It is called only for data lengths of 128 bytes and above.
+;;; The flow is as follows:
+;;; - encrypt the initial %%num_initial_blocks blocks (can be 0)
+;;; - encrypt the next 8 blocks and stitch with
+;;; GHASH for the first %%num_initial_blocks
+;;; - the last 8th block can be partial (lengths between 129 and 239)
+;;; - partial block ciphering is handled within this macro
+;;; - top bytes of such block are cleared for
+;;; the subsequent GHASH calculations
+;;; - PBlockEncKey needs to be set up in case of multi-call
+;;; - top bytes of the block need to include the encrypted counter block so that,
+;;;   when handling the partial block case, text is read and XOR'ed against it.
+;;;   This needs to be in un-shuffled format.
+
+%macro INITIAL_BLOCKS 26-27
+%define %%GDATA_KEY %1 ; [in] pointer to GCM keys
+%define %%GDATA_CTX %2 ; [in] pointer to GCM context
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer
+%define %%LENGTH %5 ; [in/out] number of bytes to process
+%define %%DATA_OFFSET %6 ; [in/out] data offset
+%define %%num_initial_blocks %7 ; [in] can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%CTR %8 ; [in/out] XMM counter block
+%define %%AAD_HASH %9 ; [in/out] ZMM with AAD hash
+%define %%ZT1 %10 ; [out] ZMM cipher blocks 0-3 for GHASH
+%define %%ZT2 %11 ; [out] ZMM cipher blocks 4-7 for GHASH
+%define %%ZT3 %12 ; [clobbered] ZMM temporary
+%define %%ZT4 %13 ; [clobbered] ZMM temporary
+%define %%ZT5 %14 ; [clobbered] ZMM temporary
+%define %%ZT6 %15 ; [clobbered] ZMM temporary
+%define %%ZT7 %16 ; [clobbered] ZMM temporary
+%define %%ZT8 %17 ; [clobbered] ZMM temporary
+%define %%ZT9 %18 ; [clobbered] ZMM temporary
+%define %%ZT10 %19 ; [clobbered] ZMM temporary
+%define %%ZT11 %20 ; [clobbered] ZMM temporary
+%define %%ZT12 %21 ; [clobbered] ZMM temporary
+%define %%IA0 %22 ; [clobbered] GP temporary
+%define %%IA1 %23 ; [clobbered] GP temporary
+%define %%ENC_DEC %24 ; [in] ENC/DEC selector
+%define %%MASKREG %25 ; [clobbered] mask register
+%define %%SHUFMASK %26 ; [in] ZMM with BE/LE shuffle mask
+%define %%PARTIAL_PRESENT %27 ; [in] "no_partial_block" option can be passed here (if length is guaranteed to be > 15*16 bytes)
+
+%define %%T1 XWORD(%%ZT1)
+%define %%T2 XWORD(%%ZT2)
+%define %%T3 XWORD(%%ZT3)
+%define %%T4 XWORD(%%ZT4)
+%define %%T5 XWORD(%%ZT5)
+%define %%T6 XWORD(%%ZT6)
+%define %%T7 XWORD(%%ZT7)
+%define %%T8 XWORD(%%ZT8)
+%define %%T9 XWORD(%%ZT9)
+
+%define %%TH %%ZT10
+%define %%TM %%ZT11
+%define %%TL %%ZT12
+
+;; determine if partial block code needs to be added
+%assign partial_block_possible 1
+%if %0 > 26
+%ifidn %%PARTIAL_PRESENT, no_partial_block
+%assign partial_block_possible 0
+%endif
+%endif
+
+%if %%num_initial_blocks > 0
+ ;; prepare AES counter blocks
+%if %%num_initial_blocks == 1
+ vpaddd %%T3, %%CTR, [rel ONE]
+%elif %%num_initial_blocks == 2
+ vshufi64x2 YWORD(%%ZT3), YWORD(%%CTR), YWORD(%%CTR), 0
+ vpaddd YWORD(%%ZT3), YWORD(%%ZT3), [rel ddq_add_1234]
+%else
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_1234]
+ vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_5678]
+%endif
+
+ ;; extract new counter value (%%T3)
+ ;; shuffle the counters for AES rounds
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%CTR, %%ZT3, (%%num_initial_blocks - 1)
+%else
+ vextracti32x4 %%CTR, %%ZT4, (%%num_initial_blocks - 5)
+%endif
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+
+ ;; load plain/cipher text
+ ZMM_LOAD_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm
+
+ ;; AES rounds and XOR with plain/cipher text
+%assign j 0
+%rep (NROUNDS + 2)
+ vbroadcastf64x2 %%ZT1, [%%GDATA_KEY + (j * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT1, j, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%num_initial_blocks, NROUNDS
+%assign j (j + 1)
+%endrep
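+        ;; note: NROUNDS is expected to be 9, 11 or 13 for AES128, AES192 and
+        ;; AES256 respectively, so the (NROUNDS + 2) iterations above consume
+        ;; the whole key schedule: the initial key add, NROUNDS middle rounds
+        ;; and the final round (see the GCM192/GCM256 notes in
+        ;; GHASH_16_ENCRYPT_16_PARALLEL further below).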
+
+ ;; write cipher/plain text back to output and
+ ;; zero bytes outside the mask before hashing
+ ZMM_STORE_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm
+
+ ;; Shuffle the cipher text blocks for hashing part
+ ;; ZT5 and ZT6 are expected outputs with blocks for hashing
+%ifidn %%ENC_DEC, DEC
+ ;; Decrypt case
+ ;; - cipher blocks are in ZT5 & ZT6
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%else
+ ;; Encrypt case
+ ;; - cipher blocks are in ZT3 & ZT4
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%ZT5, %%ZT6, no_zmm, no_zmm, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%endif ; Encrypt
+
+ ;; adjust data offset and length
+ sub %%LENGTH, (%%num_initial_blocks * 16)
+ add %%DATA_OFFSET, (%%num_initial_blocks * 16)
+
+ ;; At this stage
+ ;; - ZT5:ZT6 include cipher blocks to be GHASH'ed
+
+%endif ; %%num_initial_blocks > 0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; - cipher of %%num_initial_blocks is done
+ ;; - prepare counter blocks for the next 8 blocks (ZT3 & ZT4)
+ ;; - save the last block in %%CTR
+ ;; - shuffle the blocks for AES
+ ;; - stitch encryption of the new blocks with
+ ;; GHASHING the previous blocks
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%ZT3, ZWORD(%%CTR), [rel ddq_add_1234]
+ vpaddd %%ZT4, ZWORD(%%CTR), [rel ddq_add_5678]
+ vextracti32x4 %%CTR, %%ZT4, 3
+
+ vpshufb %%ZT3, %%SHUFMASK
+ vpshufb %%ZT4, %%SHUFMASK
+
+%if partial_block_possible != 0
+ ;; get text load/store mask (assume full mask by default)
+ mov %%IA0, 0xffff_ffff_ffff_ffff
+%if %%num_initial_blocks > 0
+ ;; NOTE: 'jge' is always taken for %%num_initial_blocks = 0
+        ;; This macro is executed for lengths of 128 and up;
+ ;; zero length is checked in GCM_ENC_DEC.
+ ;; We know there is partial block if:
+ ;; LENGTH - 16*num_initial_blocks < 128
+ cmp %%LENGTH, 128
+ jge %%_initial_partial_block_continue
+ mov %%IA1, rcx
+ mov rcx, 128
+ sub rcx, %%LENGTH
+ shr %%IA0, cl
+ mov rcx, %%IA1
+%%_initial_partial_block_continue:
+%endif
+ kmovq %%MASKREG, %%IA0
+ ;; load plain or cipher text (masked)
+ ZMM_LOAD_MASKED_BLOCKS_0_16 8, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, %%MASKREG
+%else
+ ;; load plain or cipher text
+ ZMM_LOAD_BLOCKS_0_16 8, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm
+%endif ;; partial_block_possible
+
+ ;; === AES ROUND 0
+%assign aes_round 0
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+
+ ;; === GHASH blocks 4-7
+%if (%%num_initial_blocks > 0)
+ ;; Hash in AES state
+ vpxorq %%ZT5, %%ZT5, %%AAD_HASH
+
+ VCLMUL_1_TO_8_STEP1 %%GDATA_KEY, %%ZT6, %%ZT8, %%ZT9, \
+ %%TH, %%TM, %%TL, %%num_initial_blocks
+%endif
+
+ ;; === [1/3] of AES rounds
+
+%rep ((NROUNDS + 1) / 3)
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep                 ; %rep ((NROUNDS + 1) / 3)
+
+ ;; === GHASH blocks 0-3 and gather
+%if (%%num_initial_blocks > 0)
+ VCLMUL_1_TO_8_STEP2 %%GDATA_KEY, %%ZT6, %%ZT5, \
+ %%ZT7, %%ZT8, %%ZT9, \
+ %%TH, %%TM, %%TL, %%num_initial_blocks
+%endif
+
+ ;; === [2/3] of AES rounds
+
+%rep ((NROUNDS + 1) / 3)
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep                 ; %rep ((NROUNDS + 1) / 3)
+
+ ;; === GHASH reduction
+
+%if (%%num_initial_blocks > 0)
+ ;; [out] AAD_HASH - hash output
+ ;; [in] T8 - polynomial
+ ;; [in] T6 - high, T5 - low
+ ;; [clobbered] T9, T7 - temporary
+ vmovdqu64 %%T8, [rel POLY2]
+ VCLMUL_REDUCE XWORD(%%AAD_HASH), %%T8, %%T6, %%T5, %%T7, %%T9
+%endif
+
+ ;; === [3/3] of AES rounds
+
+%rep (((NROUNDS + 1) / 3) + 2)
+%if aes_round < (NROUNDS + 2)
+ vbroadcastf64x2 %%ZT8, [%%GDATA_KEY + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT3, %%ZT4, no_zmm, no_zmm, \
+ %%ZT8, aes_round, \
+ %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif
+%endrep                 ; %rep (((NROUNDS + 1) / 3) + 2)
+
+%if partial_block_possible != 0
+ ;; write cipher/plain text back to output and
+ ;; zero bytes outside the mask before hashing
+ ZMM_STORE_MASKED_BLOCKS_0_16 8, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm, %%MASKREG
+ ;; check if there is partial block
+ cmp %%LENGTH, 128
+ jl %%_initial_save_partial
+ ;; adjust offset and length
+ add %%DATA_OFFSET, 128
+ sub %%LENGTH, 128
+ jmp %%_initial_blocks_done
+%%_initial_save_partial:
+ ;; partial block case
+ ;; - save the partial block in unshuffled format
+ ;; - ZT4 is partially XOR'ed with data and top bytes contain
+ ;; encrypted counter block only
+        ;; - save the number of bytes processed in the partial block
+ ;; - adjust offset and zero the length
+ ;; - clear top bytes of the partial block for subsequent GHASH calculations
+ vextracti32x4 [%%GDATA_CTX + PBlockEncKey], %%ZT4, 3
+ add %%DATA_OFFSET, %%LENGTH
+ sub %%LENGTH, (128 - 16)
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ xor %%LENGTH, %%LENGTH
+ vmovdqu8 %%ZT4{%%MASKREG}{z}, %%ZT4
+%%_initial_blocks_done:
+%else
+ ZMM_STORE_BLOCKS_0_16 8, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%ZT3, %%ZT4, no_zmm, no_zmm
+ add %%DATA_OFFSET, 128
+ sub %%LENGTH, 128
+%endif ;; partial_block_possible
+
+ ;; Shuffle AES result for GHASH.
+%ifidn %%ENC_DEC, DEC
+ ;; Decrypt case
+ ;; - cipher blocks are in ZT1 & ZT2
+ vpshufb %%ZT1, %%SHUFMASK
+ vpshufb %%ZT2, %%SHUFMASK
+%else
+ ;; Encrypt case
+ ;; - cipher blocks are in ZT3 & ZT4
+ vpshufb %%ZT1, %%ZT3, %%SHUFMASK
+ vpshufb %%ZT2, %%ZT4, %%SHUFMASK
+%endif ; Encrypt
+
+ ;; Current hash value is in AAD_HASH
+
+ ;; Combine GHASHed value with the corresponding ciphertext
+ vpxorq %%ZT1, %%ZT1, %%AAD_HASH
+
+%endmacro ; INITIAL_BLOCKS
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block.
+;;; It may look similar to INITIAL_BLOCKS but its usage is different:
+;;; - first encrypts/decrypts the required number of blocks and then
+;;;   ghashes these blocks
+;;; - Small packets or left-over data chunks (< 256 bytes)
+;;; - single or multi call
+;;; - Remaining data chunks below 256 bytes (multi buffer code)
+;;;
+;;; num_initial_blocks is expected to include the partial final block
+;;; in the count.
+%macro INITIAL_BLOCKS_PARTIAL 41
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] text out pointer
+%define %%PLAIN_CYPH_IN         %4  ; [in] text in pointer
+%define %%LENGTH %5 ; [in/clobbered] length in bytes
+%define %%DATA_OFFSET %6 ; [in/out] current data offset (updated)
+%define %%num_initial_blocks %7 ; [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
+%define %%CTR %8 ; [in/out] current counter value
+%define %%HASH_IN_OUT %9 ; [in/out] XMM ghash in/out value
+%define %%ENC_DEC %10 ; [in] cipher direction (ENC/DEC)
+%define %%INSTANCE_TYPE %11 ; [in] multi_call or single_call
+%define %%ZT0 %12 ; [clobbered] ZMM temporary
+%define %%ZT1 %13 ; [clobbered] ZMM temporary
+%define %%ZT2 %14 ; [clobbered] ZMM temporary
+%define %%ZT3 %15 ; [clobbered] ZMM temporary
+%define %%ZT4 %16 ; [clobbered] ZMM temporary
+%define %%ZT5 %17 ; [clobbered] ZMM temporary
+%define %%ZT6 %18 ; [clobbered] ZMM temporary
+%define %%ZT7 %19 ; [clobbered] ZMM temporary
+%define %%ZT8 %20 ; [clobbered] ZMM temporary
+%define %%ZT9 %21 ; [clobbered] ZMM temporary
+%define %%ZT10 %22 ; [clobbered] ZMM temporary
+%define %%ZT11 %23 ; [clobbered] ZMM temporary
+%define %%ZT12 %24 ; [clobbered] ZMM temporary
+%define %%ZT13 %25 ; [clobbered] ZMM temporary
+%define %%ZT14 %26 ; [clobbered] ZMM temporary
+%define %%ZT15 %27 ; [clobbered] ZMM temporary
+%define %%ZT16 %28 ; [clobbered] ZMM temporary
+%define %%ZT17 %29 ; [clobbered] ZMM temporary
+%define %%ZT18 %30 ; [clobbered] ZMM temporary
+%define %%ZT19 %31 ; [clobbered] ZMM temporary
+%define %%ZT20 %32 ; [clobbered] ZMM temporary
+%define %%ZT21 %33 ; [clobbered] ZMM temporary
+%define %%ZT22 %34 ; [clobbered] ZMM temporary
+%define %%GH %35 ; [in] ZMM ghash sum (high)
+%define %%GL %36 ; [in] ZMM ghash sum (low)
+%define %%GM %37 ; [in] ZMM ghash sum (middle)
+%define %%IA0 %38 ; [clobbered] GP temporary
+%define %%IA1 %39 ; [clobbered] GP temporary
+%define %%MASKREG %40 ; [clobbered] mask register
+%define %%SHUFMASK %41 ; [in] ZMM with BE/LE shuffle mask
+
+%define %%T1 XWORD(%%ZT1)
+%define %%T2 XWORD(%%ZT2)
+%define %%T7 XWORD(%%ZT7)
+
+%define %%CTR0 %%ZT3
+%define %%CTR1 %%ZT4
+%define %%CTR2 %%ZT8
+%define %%CTR3 %%ZT9
+
+%define %%DAT0 %%ZT5
+%define %%DAT1 %%ZT6
+%define %%DAT2 %%ZT10
+%define %%DAT3 %%ZT11
+
+%ifnidn %%GH, no_zmm
+%ifnidn %%GL, no_zmm
+%ifnidn %%GM, no_zmm
+        ;; when temporary sums are passed then zero the HASH IN value
+        ;; - whatever it holds is invalid in this case
+ vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT
+%endif
+%endif
+%endif
+ ;; Copy ghash to temp reg
+ vmovdqa64 %%T2, %%HASH_IN_OUT
+
+ ;; prepare AES counter blocks
+%if %%num_initial_blocks == 1
+ vpaddd XWORD(%%CTR0), %%CTR, [rel ONE]
+%elif %%num_initial_blocks == 2
+ vshufi64x2 YWORD(%%CTR0), YWORD(%%CTR), YWORD(%%CTR), 0
+ vpaddd YWORD(%%CTR0), YWORD(%%CTR0), [rel ddq_add_1234]
+%else
+ vshufi64x2 ZWORD(%%CTR), ZWORD(%%CTR), ZWORD(%%CTR), 0
+ vpaddd %%CTR0, ZWORD(%%CTR), [rel ddq_add_1234]
+%if %%num_initial_blocks > 4
+ vpaddd %%CTR1, ZWORD(%%CTR), [rel ddq_add_5678]
+%endif
+%if %%num_initial_blocks > 8
+ vpaddd %%CTR2, %%CTR0, [rel ddq_add_8888]
+%endif
+%if %%num_initial_blocks > 12
+ vpaddd %%CTR3, %%CTR1, [rel ddq_add_8888]
+%endif
+%endif
+
+ ;; get load/store mask
+ lea %%IA0, [rel byte64_len_to_mask_table]
+ mov %%IA1, %%LENGTH
+%if %%num_initial_blocks > 12
+ sub %%IA1, 3 * 64
+%elif %%num_initial_blocks > 8
+ sub %%IA1, 2 * 64
+%elif %%num_initial_blocks > 4
+ sub %%IA1, 64
+%endif
+ kmovq %%MASKREG, [%%IA0 + %%IA1*8]
+
+ ;; extract new counter value
+ ;; shuffle the counters for AES rounds
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%CTR, %%CTR0, (%%num_initial_blocks - 1)
+%elif %%num_initial_blocks <= 8
+ vextracti32x4 %%CTR, %%CTR1, (%%num_initial_blocks - 5)
+%elif %%num_initial_blocks <= 12
+ vextracti32x4 %%CTR, %%CTR2, (%%num_initial_blocks - 9)
+%else
+ vextracti32x4 %%CTR, %%CTR3, (%%num_initial_blocks - 13)
+%endif
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+
+ ;; load plain/cipher text
+ ZMM_LOAD_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%PLAIN_CYPH_IN, %%DATA_OFFSET, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, %%MASKREG
+
+ ;; AES rounds and XOR with plain/cipher text
+%assign j 0
+%rep (NROUNDS + 2)
+ vbroadcastf64x2 %%ZT1, [%%GDATA_KEY + (j * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%ZT1, j, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%num_initial_blocks, NROUNDS
+%assign j (j + 1)
+%endrep
+
+ ;; retrieve the last cipher counter block (partially XOR'ed with text)
+ ;; - this is needed for partial block cases
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%T1, %%CTR0, (%%num_initial_blocks - 1)
+%elif %%num_initial_blocks <= 8
+ vextracti32x4 %%T1, %%CTR1, (%%num_initial_blocks - 5)
+%elif %%num_initial_blocks <= 12
+ vextracti32x4 %%T1, %%CTR2, (%%num_initial_blocks - 9)
+%else
+ vextracti32x4 %%T1, %%CTR3, (%%num_initial_blocks - 13)
+%endif
+
+ ;; write cipher/plain text back to output and
+ ZMM_STORE_MASKED_BLOCKS_0_16 %%num_initial_blocks, %%CYPH_PLAIN_OUT, %%DATA_OFFSET, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, %%MASKREG
+
+ ;; zero bytes outside the mask before hashing
+%if %%num_initial_blocks <= 4
+ vmovdqu8 %%CTR0{%%MASKREG}{z}, %%CTR0
+%elif %%num_initial_blocks <= 8
+ vmovdqu8 %%CTR1{%%MASKREG}{z}, %%CTR1
+%elif %%num_initial_blocks <= 12
+ vmovdqu8 %%CTR2{%%MASKREG}{z}, %%CTR2
+%else
+ vmovdqu8 %%CTR3{%%MASKREG}{z}, %%CTR3
+%endif
+
+        ;; Shuffle the cipher text blocks for the hashing part
+        ;; DAT0-DAT3 are the expected outputs with blocks for hashing
+%ifidn %%ENC_DEC, DEC
+        ;; Decrypt case
+        ;; - cipher blocks are in DAT0-DAT3
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%else
+ ;; Encrypt case
+ ;; - cipher blocks are in CTR0-CTR3
+ ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 %%num_initial_blocks, vpshufb, \
+ %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
+ %%SHUFMASK, %%SHUFMASK, %%SHUFMASK, %%SHUFMASK
+%endif ; Encrypt
+
+ ;; Extract the last block for partials and multi_call cases
+%if %%num_initial_blocks <= 4
+ vextracti32x4 %%T7, %%DAT0, %%num_initial_blocks - 1
+%elif %%num_initial_blocks <= 8
+ vextracti32x4 %%T7, %%DAT1, %%num_initial_blocks - 5
+%elif %%num_initial_blocks <= 12
+ vextracti32x4 %%T7, %%DAT2, %%num_initial_blocks - 9
+%else
+ vextracti32x4 %%T7, %%DAT3, %%num_initial_blocks - 13
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Hash all but the last block of data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;; update data offset
+%if %%num_initial_blocks > 1
+ ;; The final block of data may be <16B
+ add %%DATA_OFFSET, 16 * (%%num_initial_blocks - 1)
+ sub %%LENGTH, 16 * (%%num_initial_blocks - 1)
+%endif
+
+%if %%num_initial_blocks < 16
+ ;; NOTE: the 'jl' is always taken for num_initial_blocks = 16.
+ ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 256.
+ cmp %%LENGTH, 16
+ jl %%_small_initial_partial_block
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Handle a full length final block - encrypt and hash all blocks
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ sub %%LENGTH, 16
+ add %%DATA_OFFSET, 16
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+
+ ;; Hash all of the data
+
+ ;; ZT2 - incoming AAD hash (low 128bits)
+ ;; ZT12-ZT20 - temporary registers
+ GHASH_1_TO_16 %%GDATA_KEY, %%HASH_IN_OUT, \
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, \
+ %%ZT17, %%ZT18, %%ZT19, %%ZT20, \
+ %%GH, %%GL, %%GM, \
+ %%ZT2, %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
+ %%num_initial_blocks
+
+ jmp %%_small_initial_compute_done
+%endif ; %if %%num_initial_blocks < 16
+
+%%_small_initial_partial_block:
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;;; Handle ghash for a <16B final block
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+        ;; In this case, if it's a single call to encrypt we can
+        ;; hash all of the data, but if it's an init / update / finalize
+        ;; series of calls we need to leave out the last block if it's
+        ;; less than a full block of data.
+
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ ;; %%T1 is ciphered counter block
+ vmovdqu64 [%%GDATA_CTX + PBlockEncKey], %%T1
+
+%ifidn %%INSTANCE_TYPE, multi_call
+%assign k (%%num_initial_blocks - 1)
+%assign last_block_to_hash 1
+%else
+%assign k (%%num_initial_blocks)
+%assign last_block_to_hash 0
+%endif
+
+%if (%%num_initial_blocks > last_block_to_hash)
+
+ ;; ZT12-ZT20 - temporary registers
+ GHASH_1_TO_16 %%GDATA_KEY, %%HASH_IN_OUT, \
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, \
+ %%ZT17, %%ZT18, %%ZT19, %%ZT20, \
+ %%GH, %%GL, %%GM, \
+ %%ZT2, %%DAT0, %%DAT1, %%DAT2, %%DAT3, k
+
+ ;; just fall through no jmp needed
+%else
+ ;; Record that a reduction is not needed -
+ ;; In this case no hashes are computed because there
+ ;; is only one initial block and it is < 16B in length.
+ ;; We only need to check if a reduction is needed if
+ ;; initial_blocks == 1 and init/update/final is being used.
+ ;; In this case we may just have a partial block, and that
+ ;; gets hashed in finalize.
+
+%assign need_for_reduction 1
+%ifidn %%GH, no_zmm
+%ifidn %%GL, no_zmm
+%ifidn %%GM, no_zmm
+;; if %%GH, %%GL & %%GM not passed then reduction is not required
+%assign need_for_reduction 0
+%endif
+%endif
+%endif
+
+%if need_for_reduction == 0
+ ;; The hash should end up in HASH_IN_OUT.
+ ;; The only way we should get here is if there is
+ ;; a partial block of data, so xor that into the hash.
+ vpxorq %%HASH_IN_OUT, %%T2, %%T7
+%else
+        ;; here we have nothing to ghash in the small data, but
+        ;; GHASH sums were passed in that we need to gather and reduce
+
+ ;; integrate TM into TH and TL
+ vpsrldq %%ZT12, %%GM, 8
+ vpslldq %%ZT13, %%GM, 8
+ vpxorq %%GH, %%GH, %%ZT12
+ vpxorq %%GL, %%GL, %%ZT13
+
+ ;; add TH and TL 128-bit words horizontally
+ VHPXORI4x128 %%GH, %%ZT12
+ VHPXORI4x128 %%GL, %%ZT13
+
+ ;; reduction
+ vmovdqa64 XWORD(%%ZT12), [rel POLY2]
+ VCLMUL_REDUCE %%HASH_IN_OUT, XWORD(%%ZT12), \
+ XWORD(%%GH), XWORD(%%GL), XWORD(%%ZT13), XWORD(%%ZT14)
+
+ vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT, %%T7
+%endif
+ ;; The result is in %%HASH_IN_OUT
+ jmp %%_after_reduction
+%endif
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; After GHASH reduction
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_small_initial_compute_done:
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; If using init/update/finalize, we need to xor any partial block data
+ ;; into the hash.
+%if %%num_initial_blocks > 1
+ ;; NOTE: for %%num_initial_blocks = 0 the xor never takes place
+%if %%num_initial_blocks != 16
+        ;; NOTE: for %%num_initial_blocks = 16, %%LENGTH (stored in [PBlockLen]) is never zero
+ or %%LENGTH, %%LENGTH
+ je %%_after_reduction
+%endif ; %%num_initial_blocks != 16
+ vpxorq %%HASH_IN_OUT, %%HASH_IN_OUT, %%T7
+%endif ; %%num_initial_blocks > 1
+%endif ; %%INSTANCE_TYPE, multi_call
+
+%%_after_reduction:
+ ;; Final hash is now in HASH_IN_OUT
+
+%endmacro ; INITIAL_BLOCKS_PARTIAL
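+
+;;; Note on the %%GH/%%GL/%%GM arguments of INITIAL_BLOCKS_PARTIAL: when all
+;;; three are "no_zmm" the incoming hash is taken from %%HASH_IN_OUT; when
+;;; real ZMM registers are passed they carry unreduced high/low/middle GHASH
+;;; sums to be folded in, and %%HASH_IN_OUT is zeroed on entry and only used
+;;; for the reduced output.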
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Main GCM macro stitching cipher with GHASH
+;;; - operates on single stream
+;;; - encrypts 8 blocks at a time
+;;; - ghash the 8 previously encrypted ciphertext blocks
+;;; For the partial block case with multi_call, AES_PARTIAL_BLOCK on output
+;;; contains the encrypted counter block.
+%macro GHASH_8_ENCRYPT_8_PARALLEL 34-37
+%define %%GDATA %1 ; [in] key pointer
+%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer
+%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer
+%define %%DATA_OFFSET %4 ; [in] data offset
+%define %%CTR1 %5 ; [in/out] ZMM counter blocks 0 to 3
+%define %%CTR2 %6 ; [in/out] ZMM counter blocks 4 to 7
+%define %%GHASHIN_AESOUT_B03 %7 ; [in/out] ZMM ghash in / aes out blocks 0 to 3
+%define %%GHASHIN_AESOUT_B47 %8 ; [in/out] ZMM ghash in / aes out blocks 4 to 7
+%define %%AES_PARTIAL_BLOCK %9 ; [out] XMM partial block (AES)
+%define %%loop_idx %10 ; [in] counter block prep selection "add+shuffle" or "add"
+%define %%ENC_DEC %11 ; [in] cipher direction
+%define %%FULL_PARTIAL %12 ; [in] last block type selection "full" or "partial"
+%define %%IA0 %13 ; [clobbered] temporary GP register
+%define %%IA1 %14 ; [clobbered] temporary GP register
+%define %%LENGTH %15 ; [in] length
+%define %%INSTANCE_TYPE %16 ; [in] 'single_call' or 'multi_call' selection
+%define %%GH4KEY %17 ; [in] ZMM with GHASH keys 4 to 1
+%define %%GH8KEY %18 ; [in] ZMM with GHASH keys 8 to 5
+%define %%SHFMSK %19 ; [in] ZMM with byte swap mask for pshufb
+%define %%ZT1 %20 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT2 %21 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT3 %22 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT4 %23 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT5 %24 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT10 %25 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT11 %26 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT12 %27 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT13 %28 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT14 %29 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT15 %30 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT16 %31 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT17 %32 ; [clobbered] temporary ZMM (ghash)
+%define %%MASKREG %33 ; [clobbered] mask register for partial loads/stores
+%define %%DO_REDUCTION %34 ; [in] "reduction", "no_reduction", "final_reduction"
+%define %%TO_REDUCE_L %35 ; [in/out] ZMM for low 4x128-bit in case of "no_reduction"
+%define %%TO_REDUCE_H %36 ; [in/out] ZMM for hi 4x128-bit in case of "no_reduction"
+%define %%TO_REDUCE_M %37 ; [in/out] ZMM for medium 4x128-bit in case of "no_reduction"
+
+%define %%GH1H %%ZT10
+%define %%GH1L %%ZT11
+%define %%GH1M1 %%ZT12
+%define %%GH1M2 %%ZT13
+
+%define %%GH2H %%ZT14
+%define %%GH2L %%ZT15
+%define %%GH2M1 %%ZT16
+%define %%GH2M2 %%ZT17
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; populate counter blocks for cipher part
+%ifidn %%loop_idx, in_order
+        ;; %%CTR1 & %%CTR2 are shuffled outside the scope of this macro;
+        ;; they have to be kept in unshuffled format
+ vpshufb %%ZT1, %%CTR1, %%SHFMSK
+ vpshufb %%ZT2, %%CTR2, %%SHFMSK
+%else
+ vmovdqa64 %%ZT1, %%CTR1
+ vmovdqa64 %%ZT2, %%CTR2
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; stitch AES rounds with GHASH
+
+%assign aes_round 0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 0 - ARK
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+
+ ;;==================================================
+ ;; GHASH 4 blocks
+ vpclmulqdq %%GH1H, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x11 ; a1*b1
+ vpclmulqdq %%GH1L, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x00 ; a0*b0
+ vpclmulqdq %%GH1M1, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x01 ; a1*b0
+ vpclmulqdq %%GH1M2, %%GHASHIN_AESOUT_B47, %%GH4KEY, 0x10 ; a0*b1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 3 AES rounds
+%rep 3
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep ; 3 x AES ROUND
+
+ ;; =================================================
+ ;; GHASH 4 blocks
+ vpclmulqdq %%GH2M1, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x10 ; a0*b1
+ vpclmulqdq %%GH2M2, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x01 ; a1*b0
+ vpclmulqdq %%GH2H, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x11 ; a1*b1
+ vpclmulqdq %%GH2L, %%GHASHIN_AESOUT_B03, %%GH8KEY, 0x00 ; a0*b0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 3 AES rounds
+%rep 3
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep ; 3 x AES ROUND
+
+ ;; =================================================
+ ;; gather GHASH in GH1L (low) and GH1H (high)
+%ifidn %%DO_REDUCTION, no_reduction
+ vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1
+ vpternlogq %%TO_REDUCE_M, %%GH1M1, %%GH2M2, 0x96 ; TM: TO_REDUCE_M ^= GH1M1 ^ GH2M2
+ vpternlogq %%TO_REDUCE_H, %%GH1H, %%GH2H, 0x96 ; TH: TO_REDUCE_H ^= GH1H ^ GH2H
+ vpternlogq %%TO_REDUCE_L, %%GH1L, %%GH2L, 0x96 ; TL: TO_REDUCE_L ^= GH1L ^ GH2L
+%endif
+%ifidn %%DO_REDUCTION, do_reduction
+ ;; phase 1: add mid products together
+ vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1
+ vpxorq %%GH1M1, %%GH1M1, %%GH2M2
+
+ vpsrldq %%GH2M1, %%GH1M1, 8
+ vpslldq %%GH1M1, %%GH1M1, 8
+%endif
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; phase 1: add mid products together
+ vpternlogq %%GH1M1, %%GH1M2, %%GH2M1, 0x96 ; TM: GH1M1 ^= GH1M2 ^ GH2M1
+ vpternlogq %%GH1M1, %%TO_REDUCE_M, %%GH2M2, 0x96 ; TM: GH1M1 ^= TO_REDUCE_M ^ GH2M2
+
+ vpsrldq %%GH2M1, %%GH1M1, 8
+ vpslldq %%GH1M1, %%GH1M1, 8
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2 AES rounds
+%rep 2
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endrep ; 2 x AES ROUND
+
+ ;; =================================================
+ ;; Add mid product to high and low then
+ ;; horizontal xor of low and high 4x128
+%ifidn %%DO_REDUCTION, final_reduction
+ vpternlogq %%GH1H, %%GH2H, %%GH2M1, 0x96 ; TH = TH1 + TH2 + TM>>64
+ vpxorq %%GH1H, %%TO_REDUCE_H
+ vpternlogq %%GH1L, %%GH2L, %%GH1M1, 0x96 ; TL = TL1 + TL2 + TM<<64
+ vpxorq %%GH1L, %%TO_REDUCE_L
+%endif
+%ifidn %%DO_REDUCTION, do_reduction
+ vpternlogq %%GH1H, %%GH2H, %%GH2M1, 0x96 ; TH = TH1 + TH2 + TM>>64
+ vpternlogq %%GH1L, %%GH2L, %%GH1M1, 0x96 ; TL = TL1 + TL2 + TM<<64
+%endif
+%ifnidn %%DO_REDUCTION, no_reduction
+ VHPXORI4x128 %%GH1H, %%GH2H
+ VHPXORI4x128 %%GH1L, %%GH2L
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2 AES rounds
+%rep 2
+%if (aes_round < (NROUNDS + 1))
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif ; aes_round < (NROUNDS + 1)
+%endrep
+
+ ;; =================================================
+ ;; first phase of reduction
+%ifnidn %%DO_REDUCTION, no_reduction
+ vmovdqu64 XWORD(%%GH2M2), [rel POLY2]
+ vpclmulqdq XWORD(%%ZT15), XWORD(%%GH2M2), XWORD(%%GH1L), 0x01
+ vpslldq XWORD(%%ZT15), XWORD(%%ZT15), 8 ; shift-L 2 DWs
+ vpxorq XWORD(%%ZT15), XWORD(%%GH1L), XWORD(%%ZT15) ; first phase of the reduct
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; 2 AES rounds
+%rep 2
+%if (aes_round < (NROUNDS + 1))
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif ; aes_round < (NROUNDS + 1)
+%endrep
+
+ ;; =================================================
+ ;; second phase of the reduction
+%ifnidn %%DO_REDUCTION, no_reduction
+ vpclmulqdq XWORD(%%ZT16), XWORD(%%GH2M2), XWORD(%%ZT15), 0x00
+ vpsrldq XWORD(%%ZT16), XWORD(%%ZT16), 4 ; shift-R 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq XWORD(%%ZT13), XWORD(%%GH2M2), XWORD(%%ZT15), 0x10
+ vpslldq XWORD(%%ZT13), XWORD(%%ZT13), 4 ; shift-L 1-DW for result without shifts
+ ;; ZT13 = ZT13 xor ZT16 xor GH1H
+ vpternlogq XWORD(%%ZT13), XWORD(%%ZT16), XWORD(%%GH1H), 0x96
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; all remaining AES rounds but the last
+%rep (NROUNDS + 2)
+%if (aes_round < (NROUNDS + 1))
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+%assign aes_round (aes_round + 1)
+%endif ; aes_round < (NROUNDS + 1)
+%endrep
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; load/store mask (partial case) and load the text data
+%ifidn %%FULL_PARTIAL, full
+ vmovdqu8 %%ZT4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vmovdqu8 %%ZT5, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64]
+%else
+ lea %%IA0, [rel byte64_len_to_mask_table]
+ mov %%IA1, %%LENGTH
+ sub %%IA1, 64
+ kmovq %%MASKREG, [%%IA0 + 8*%%IA1]
+ vmovdqu8 %%ZT4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vmovdqu8 %%ZT5{%%MASKREG}{z}, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 64]
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; the last AES round (NROUNDS + 1) and XOR against plain/cipher text
+ vbroadcastf64x2 %%ZT3, [%%GDATA + (aes_round * 16)]
+ ZMM_AESENC_ROUND_BLOCKS_0_16 %%ZT1, %%ZT2, no_zmm, no_zmm, \
+ %%ZT3, aes_round, \
+ %%ZT4, %%ZT5, no_zmm, no_zmm, \
+ 8, NROUNDS
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; store the cipher/plain text data
+%ifidn %%FULL_PARTIAL, full
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64], %%ZT2
+%else
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], %%ZT1
+ vmovdqu8 [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 64]{%%MASKREG}, %%ZT2
+%endif
+
+ ;; =================================================
+ ;; prep cipher text blocks for the next ghash round
+
+%ifnidn %%FULL_PARTIAL, full
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; for partial block & multi_call we need encrypted counter block
+ vpxorq %%ZT3, %%ZT2, %%ZT5
+ vextracti32x4 %%AES_PARTIAL_BLOCK, %%ZT3, 3
+%endif
+        ;; for GHASH computation purposes, clear the top bytes of the partial block
+%ifidn %%ENC_DEC, ENC
+ vmovdqu8 %%ZT2{%%MASKREG}{z}, %%ZT2
+%else
+ vmovdqu8 %%ZT5{%%MASKREG}{z}, %%ZT5
+%endif
+%endif ; %ifnidn %%FULL_PARTIAL, full
+
+ ;; =================================================
+ ;; shuffle cipher text blocks for GHASH computation
+%ifidn %%ENC_DEC, ENC
+ vpshufb %%GHASHIN_AESOUT_B03, %%ZT1, %%SHFMSK
+ vpshufb %%GHASHIN_AESOUT_B47, %%ZT2, %%SHFMSK
+%else
+ vpshufb %%GHASHIN_AESOUT_B03, %%ZT4, %%SHFMSK
+ vpshufb %%GHASHIN_AESOUT_B47, %%ZT5, %%SHFMSK
+%endif
+
+%ifidn %%DO_REDUCTION, do_reduction
+ ;; =================================================
+ ;; XOR current GHASH value (ZT13) into block 0
+ vpxorq %%GHASHIN_AESOUT_B03, %%ZT13
+%endif
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; =================================================
+ ;; Return GHASH value (ZT13) in TO_REDUCE_L
+ vmovdqa64 %%TO_REDUCE_L, %%ZT13
+%endif
+
+%endmacro ; GHASH_8_ENCRYPT_8_PARALLEL
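+
+;;; %%DO_REDUCTION selects how GHASH_8_ENCRYPT_8_PARALLEL handles the GHASH
+;;; sums: "no_reduction" only accumulates the unreduced high/low/middle sums
+;;; in %%TO_REDUCE_H/L/M, "do_reduction" reduces this iteration's 8 blocks and
+;;; XORs the result into block 0 of %%GHASHIN_AESOUT_B03 for the next pass,
+;;; and "final_reduction" folds the accumulated sums in as well and returns
+;;; the reduced GHASH value in %%TO_REDUCE_L.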
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Main GCM macro stitching cipher with GHASH
+;;; - operates on single stream
+;;; - encrypts 16 blocks at a time
+;;; - ghash the 16 previously encrypted ciphertext blocks
+;;; - no partial block or multi_call handling here
+%macro GHASH_16_ENCRYPT_16_PARALLEL 42
+%define %%GDATA %1 ; [in] key pointer
+%define %%CYPH_PLAIN_OUT %2 ; [in] pointer to output buffer
+%define %%PLAIN_CYPH_IN %3 ; [in] pointer to input buffer
+%define %%DATA_OFFSET %4 ; [in] data offset
+%define %%CTR_BE %5 ; [in/out] ZMM counter blocks (last 4) in big-endian
+%define %%CTR_CHECK %6 ; [in/out] GP with 8-bit counter for overflow check
+%define %%HASHKEY_OFFSET %7 ; [in] numerical offset for the highest hash key
+%define %%AESOUT_BLK_OFFSET %8 ; [in] numerical offset for AES-CTR out
+%define %%GHASHIN_BLK_OFFSET %9 ; [in] numerical offset for GHASH blocks in
+%define %%SHFMSK %10 ; [in] ZMM with byte swap mask for pshufb
+%define %%ZT1 %11 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT2 %12 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT3 %13 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT4 %14 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT5 %15 ; [clobbered/out] temporary ZMM or GHASH OUT (final_reduction)
+%define %%ZT6 %16 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT7 %17 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT8 %18 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT9 %19 ; [clobbered] temporary ZMM (cipher)
+%define %%ZT10 %20 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT11 %21 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT12 %22 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT13 %23 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT14 %24 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT15 %25 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT16 %26 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT17 %27 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT18 %28 ; [clobbered] temporary ZMM (ghash)
+%define %%ZT19 %29 ; [clobbered] temporary ZMM
+%define %%ZT20 %30 ; [clobbered] temporary ZMM
+%define %%ZT21 %31 ; [clobbered] temporary ZMM
+%define %%ZT22 %32 ; [clobbered] temporary ZMM
+%define %%ZT23 %33 ; [clobbered] temporary ZMM
+%define %%ADDBE_4x4           %34 ; [in] ZMM with 4 x 128-bit lanes each holding 4, in big-endian
+%define %%ADDBE_1234          %35 ; [in] ZMM with 4 x 128-bit lanes holding 1, 2, 3 and 4, in big-endian
+%define %%TO_REDUCE_L %36 ; [in/out] ZMM for low 4x128-bit GHASH sum
+%define %%TO_REDUCE_H %37 ; [in/out] ZMM for hi 4x128-bit GHASH sum
+%define %%TO_REDUCE_M %38 ; [in/out] ZMM for medium 4x128-bit GHASH sum
+%define %%DO_REDUCTION %39 ; [in] "no_reduction", "final_reduction", "first_time"
+%define %%ENC_DEC %40 ; [in] cipher direction
+%define %%DATA_DISPL %41 ; [in] fixed numerical data displacement/offset
+%define %%GHASH_IN %42 ; [in] current GHASH value or "no_ghash_in"
+
+%define %%B00_03 %%ZT1
+%define %%B04_07 %%ZT2
+%define %%B08_11 %%ZT3
+%define %%B12_15 %%ZT4
+
+%define %%GH1H %%ZT5 ; @note: do not change this mapping
+%define %%GH1L %%ZT6
+%define %%GH1M %%ZT7
+%define %%GH1T %%ZT8
+
+%define %%GH2H %%ZT9
+%define %%GH2L %%ZT10
+%define %%GH2M %%ZT11
+%define %%GH2T %%ZT12
+
+%define %%RED_POLY %%GH2T
+%define %%RED_P1 %%GH2L
+%define %%RED_T1 %%GH2H
+%define %%RED_T2 %%GH2M
+
+%define %%GH3H %%ZT13
+%define %%GH3L %%ZT14
+%define %%GH3M %%ZT15
+%define %%GH3T %%ZT16
+
+%define %%DATA1 %%ZT13
+%define %%DATA2 %%ZT14
+%define %%DATA3 %%ZT15
+%define %%DATA4 %%ZT16
+
+%define %%AESKEY1 %%ZT17
+%define %%AESKEY2 %%ZT18
+
+%define %%GHKEY1 %%ZT19
+%define %%GHKEY2 %%ZT20
+%define %%GHDAT1 %%ZT21
+%define %%GHDAT2 %%ZT22
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; prepare counter blocks
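+        ;; the counter blocks are kept in big-endian form; the fast path below
+        ;; adds the big-endian ADDBE constants directly, which is only valid
+        ;; while the low byte of the counter does not wrap. %%CTR_CHECK tracks
+        ;; that byte, and the overflow path byte-swaps to little-endian,
+        ;; increments, then swaps back.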
+
+ cmp BYTE(%%CTR_CHECK), (256 - 16)
+ jae %%_16_blocks_overflow
+ vpaddd %%B00_03, %%CTR_BE, %%ADDBE_1234
+ vpaddd %%B04_07, %%B00_03, %%ADDBE_4x4
+ vpaddd %%B08_11, %%B04_07, %%ADDBE_4x4
+ vpaddd %%B12_15, %%B08_11, %%ADDBE_4x4
+ jmp %%_16_blocks_ok
+%%_16_blocks_overflow:
+ vpshufb %%CTR_BE, %%CTR_BE, %%SHFMSK
+ vmovdqa64 %%B12_15, [rel ddq_add_4444]
+ vpaddd %%B00_03, %%CTR_BE, [rel ddq_add_1234]
+ vpaddd %%B04_07, %%B00_03, %%B12_15
+ vpaddd %%B08_11, %%B04_07, %%B12_15
+ vpaddd %%B12_15, %%B08_11, %%B12_15
+ vpshufb %%B00_03, %%SHFMSK
+ vpshufb %%B04_07, %%SHFMSK
+ vpshufb %%B08_11, %%SHFMSK
+ vpshufb %%B12_15, %%SHFMSK
+%%_16_blocks_ok:
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; pre-load constants
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 0)]
+%ifnidn %%GHASH_IN, no_ghash_in
+ vpxorq %%GHDAT1, %%GHASH_IN, [rsp + %%GHASHIN_BLK_OFFSET + (0*64)]
+%else
+ vmovdqa64 %%GHDAT1, [rsp + %%GHASHIN_BLK_OFFSET + (0*64)]
+%endif
+ vmovdqu64 %%GHKEY1, [%%GDATA + %%HASHKEY_OFFSET + (0*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; save counter for the next round
+ ;; increment counter overflow check register
+ vshufi64x2 %%CTR_BE, %%B12_15, %%B12_15, 1111_1111b
+ add BYTE(%%CTR_CHECK), 16
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; pre-load constants
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 1)]
+ vmovdqu64 %%GHKEY2, [%%GDATA + %%HASHKEY_OFFSET + (1*64)]
+ vmovdqa64 %%GHDAT2, [rsp + %%GHASHIN_BLK_OFFSET + (1*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; stitch AES rounds with GHASH
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 0 - ARK
+
+ vpxorq %%B00_03, %%AESKEY1
+ vpxorq %%B04_07, %%AESKEY1
+ vpxorq %%B08_11, %%AESKEY1
+ vpxorq %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 2)]
+
+ ;;==================================================
+ ;; GHASH 4 blocks (15 to 12)
+ vpclmulqdq %%GH1H, %%GHDAT1, %%GHKEY1, 0x11 ; a1*b1
+ vpclmulqdq %%GH1L, %%GHDAT1, %%GHKEY1, 0x00 ; a0*b0
+ vpclmulqdq %%GH1M, %%GHDAT1, %%GHKEY1, 0x01 ; a1*b0
+ vpclmulqdq %%GH1T, %%GHDAT1, %%GHKEY1, 0x10 ; a0*b1
+
+ vmovdqu64 %%GHKEY1, [%%GDATA + %%HASHKEY_OFFSET + (2*64)]
+ vmovdqa64 %%GHDAT1, [rsp + %%GHASHIN_BLK_OFFSET + (2*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 1
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 3)]
+
+ ;; =================================================
+ ;; GHASH 4 blocks (11 to 8)
+ vpclmulqdq %%GH2M, %%GHDAT2, %%GHKEY2, 0x10 ; a0*b1
+ vpclmulqdq %%GH2T, %%GHDAT2, %%GHKEY2, 0x01 ; a1*b0
+ vpclmulqdq %%GH2H, %%GHDAT2, %%GHKEY2, 0x11 ; a1*b1
+ vpclmulqdq %%GH2L, %%GHDAT2, %%GHKEY2, 0x00 ; a0*b0
+
+ vmovdqu64 %%GHKEY2, [%%GDATA + %%HASHKEY_OFFSET + (3*64)]
+ vmovdqa64 %%GHDAT2, [rsp + %%GHASHIN_BLK_OFFSET + (3*64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 2
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 4)]
+
+ ;; =================================================
+ ;; GHASH 4 blocks (7 to 4)
+ vpclmulqdq %%GH3M, %%GHDAT1, %%GHKEY1, 0x10 ; a0*b1
+ vpclmulqdq %%GH3T, %%GHDAT1, %%GHKEY1, 0x01 ; a1*b0
+ vpclmulqdq %%GH3H, %%GHDAT1, %%GHKEY1, 0x11 ; a1*b1
+ vpclmulqdq %%GH3L, %%GHDAT1, %%GHKEY1, 0x00 ; a0*b0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES rounds 3
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 5)]
+
+ ;; =================================================
+ ;; Gather (XOR) GHASH for 12 blocks
+ vpternlogq %%GH1H, %%GH2H, %%GH3H, 0x96
+ vpternlogq %%GH1L, %%GH2L, %%GH3L, 0x96
+ vpternlogq %%GH1T, %%GH2T, %%GH3T, 0x96
+ vpternlogq %%GH1M, %%GH2M, %%GH3M, 0x96
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES rounds 4
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 6)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; load plain/cipher text (recycle GH3xx registers)
+ VX512LDR %%DATA1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (0 * 64)]
+ VX512LDR %%DATA2, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (1 * 64)]
+ VX512LDR %%DATA3, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (2 * 64)]
+ VX512LDR %%DATA4, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + %%DATA_DISPL + (3 * 64)]
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES rounds 5
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 7)]
+
+ ;; =================================================
+ ;; GHASH 4 blocks (3 to 0)
+ vpclmulqdq %%GH2M, %%GHDAT2, %%GHKEY2, 0x10 ; a0*b1
+ vpclmulqdq %%GH2T, %%GHDAT2, %%GHKEY2, 0x01 ; a1*b0
+ vpclmulqdq %%GH2H, %%GHDAT2, %%GHKEY2, 0x11 ; a1*b1
+ vpclmulqdq %%GH2L, %%GHDAT2, %%GHKEY2, 0x00 ; a0*b0
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 6
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 8)]
+
+ ;; =================================================
+ ;; gather GHASH in GH1L (low) and GH1H (high)
+%ifidn %%DO_REDUCTION, first_time
+ vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM
+ vpxorq %%TO_REDUCE_M, %%GH1M, %%GH2M ; TM
+ vpxorq %%TO_REDUCE_H, %%GH1H, %%GH2H ; TH
+ vpxorq %%TO_REDUCE_L, %%GH1L, %%GH2L ; TL
+%endif
+%ifidn %%DO_REDUCTION, no_reduction
+ vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM
+ vpternlogq %%TO_REDUCE_M, %%GH1M, %%GH2M, 0x96 ; TM
+ vpternlogq %%TO_REDUCE_H, %%GH1H, %%GH2H, 0x96 ; TH
+ vpternlogq %%TO_REDUCE_L, %%GH1L, %%GH2L, 0x96 ; TL
+%endif
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; phase 1: add mid products together
+ ;; also load polynomial constant for reduction
+ vpternlogq %%GH1M, %%GH1T, %%GH2T, 0x96 ; TM
+ vpternlogq %%GH1M, %%TO_REDUCE_M, %%GH2M, 0x96
+
+ vpsrldq %%GH2M, %%GH1M, 8
+ vpslldq %%GH1M, %%GH1M, 8
+
+ vmovdqa64 XWORD(%%RED_POLY), [rel POLY2]
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 7
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 9)]
+
+ ;; =================================================
+ ;; Add mid product to high and low
+%ifidn %%DO_REDUCTION, final_reduction
+ vpternlogq %%GH1H, %%GH2H, %%GH2M, 0x96 ; TH = TH1 + TH2 + TM>>64
+ vpxorq %%GH1H, %%TO_REDUCE_H
+ vpternlogq %%GH1L, %%GH2L, %%GH1M, 0x96 ; TL = TL1 + TL2 + TM<<64
+ vpxorq %%GH1L, %%TO_REDUCE_L
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 8
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 10)]
+
+ ;; =================================================
+ ;; horizontal xor of low and high 4x128
+%ifidn %%DO_REDUCTION, final_reduction
+ VHPXORI4x128 %%GH1H, %%GH2H
+ VHPXORI4x128 %%GH1L, %%GH2L
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES round 9
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+%if (NROUNDS >= 11)
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 11)]
+%endif
+ ;; =================================================
+ ;; first phase of reduction
+%ifidn %%DO_REDUCTION, final_reduction
+ vpclmulqdq XWORD(%%RED_P1), XWORD(%%RED_POLY), XWORD(%%GH1L), 0x01
+ vpslldq XWORD(%%RED_P1), XWORD(%%RED_P1), 8 ; shift-L 2 DWs
+        vpxorq          XWORD(%%RED_P1), XWORD(%%GH1L), XWORD(%%RED_P1)    ; first phase of the reduction
+%endif
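+        ;; Note: the final_reduction path folds the accumulated 256-bit
+        ;; carry-less product (high half in GH1H, low half in GH1L) back to
+        ;; 128 bits modulo the GHASH polynomial
+        ;; g(x) = x^128 + x^7 + x^2 + x + 1, with POLY2 assumed to hold the
+        ;; pre-computed reduction constant. Phase 1 above processes GH1L;
+        ;; phase 2 further below combines the result into GH1H, which then
+        ;; carries the reduced GHASH value.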
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; AES rounds up to 11 (AES192) or 13 (AES256)
+ ;; AES128 is done
+%if (NROUNDS >= 11)
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 12)]
+
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+%if (NROUNDS == 13)
+ vbroadcastf64x2 %%AESKEY2, [%%GDATA + (16 * 13)]
+
+ vaesenc %%B00_03, %%B00_03, %%AESKEY1
+ vaesenc %%B04_07, %%B04_07, %%AESKEY1
+ vaesenc %%B08_11, %%B08_11, %%AESKEY1
+ vaesenc %%B12_15, %%B12_15, %%AESKEY1
+ vbroadcastf64x2 %%AESKEY1, [%%GDATA + (16 * 14)]
+
+ vaesenc %%B00_03, %%B00_03, %%AESKEY2
+ vaesenc %%B04_07, %%B04_07, %%AESKEY2
+ vaesenc %%B08_11, %%B08_11, %%AESKEY2
+ vaesenc %%B12_15, %%B12_15, %%AESKEY2
+%endif ; GCM256 / NROUNDS = 13 (15 including the first and the last)
+%endif ; GCM192 / NROUNDS = 11 (13 including the first and the last)
+
+ ;; =================================================
+ ;; second phase of the reduction
+%ifidn %%DO_REDUCTION, final_reduction
+ vpclmulqdq XWORD(%%RED_T1), XWORD(%%RED_POLY), XWORD(%%RED_P1), 0x00
+ vpsrldq XWORD(%%RED_T1), XWORD(%%RED_T1), 4 ; shift-R 1-DW to obtain 2-DWs shift-R
+
+ vpclmulqdq XWORD(%%RED_T2), XWORD(%%RED_POLY), XWORD(%%RED_P1), 0x10
+ vpslldq XWORD(%%RED_T2), XWORD(%%RED_T2), 4 ; shift-L 1-DW for result without shifts
+ ;; GH1H = GH1H x RED_T1 x RED_T2
+ vpternlogq XWORD(%%GH1H), XWORD(%%RED_T2), XWORD(%%RED_T1), 0x96
+%endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; the last AES round
+ vaesenclast %%B00_03, %%B00_03, %%AESKEY1
+ vaesenclast %%B04_07, %%B04_07, %%AESKEY1
+ vaesenclast %%B08_11, %%B08_11, %%AESKEY1
+ vaesenclast %%B12_15, %%B12_15, %%AESKEY1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; XOR against plain/cipher text
+ vpxorq %%B00_03, %%B00_03, %%DATA1
+ vpxorq %%B04_07, %%B04_07, %%DATA2
+ vpxorq %%B08_11, %%B08_11, %%DATA3
+ vpxorq %%B12_15, %%B12_15, %%DATA4
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; store cipher/plain text
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (0 * 64)], %%B00_03
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (1 * 64)], %%B04_07
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (2 * 64)], %%B08_11
+ VX512STR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + %%DATA_DISPL + (3 * 64)], %%B12_15
+
+ ;; =================================================
+ ;; shuffle cipher text blocks for GHASH computation
+%ifidn %%ENC_DEC, ENC
+ vpshufb %%B00_03, %%B00_03, %%SHFMSK
+ vpshufb %%B04_07, %%B04_07, %%SHFMSK
+ vpshufb %%B08_11, %%B08_11, %%SHFMSK
+ vpshufb %%B12_15, %%B12_15, %%SHFMSK
+%else
+ vpshufb %%B00_03, %%DATA1, %%SHFMSK
+ vpshufb %%B04_07, %%DATA2, %%SHFMSK
+ vpshufb %%B08_11, %%DATA3, %%SHFMSK
+ vpshufb %%B12_15, %%DATA4, %%SHFMSK
+%endif
+
+ ;; =================================================
+ ;; store shuffled cipher text for ghashing
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (0*64)], %%B00_03
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (1*64)], %%B04_07
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (2*64)], %%B08_11
+ vmovdqa64 [rsp + %%AESOUT_BLK_OFFSET + (3*64)], %%B12_15
+
+%ifidn %%DO_REDUCTION, final_reduction
+ ;; =================================================
+ ;; Return GHASH value through %%GH1H
+%endif
+
+%endmacro ; GHASH_16_ENCRYPT_16_PARALLEL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GHASH the last 8 ciphertext blocks.
+;;; - optionally accepts GHASH product sums as input
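+;;; For reference, with input blocks B1..B8 (B1 oldest, the running hash
+;;; already xor-ed into B1 by the caller) and hash key powers H^8..H^1 this
+;;; evaluates, in GF(2^128) mod x^128 + x^7 + x^2 + x + 1:
+;;;     AAD_HASH = B1*H^8 xor B2*H^7 xor ... xor B8*H^1
+;;; via VCLMUL_STEP1/STEP2 carry-less products and one VCLMUL_REDUCE at the
+;;; end; the optional GH/GL/GM sums are xor-ed in before the reduction.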
+%macro GHASH_LAST_8 10-13
+%define %%GDATA %1 ; [in] key pointer
+%define %%BL47 %2 ; [in/clobbered] ZMM AES blocks 4 to 7
+%define %%BL03          %3      ; [in/clobbered] ZMM AES blocks 0 to 3
+%define %%ZTH           %4      ; [clobbered] ZMM temporary
+%define %%ZTM           %5      ; [clobbered] ZMM temporary
+%define %%ZTL           %6      ; [clobbered] ZMM temporary
+%define %%ZT01          %7      ; [clobbered] ZMM temporary
+%define %%ZT02          %8      ; [clobbered] ZMM temporary
+%define %%ZT03          %9      ; [clobbered] ZMM temporary
+%define %%AAD_HASH %10 ; [out] XMM hash value
+%define %%GH %11 ; [in/optional] ZMM with GHASH high product sum
+%define %%GL %12 ; [in/optional] ZMM with GHASH low product sum
+%define %%GM %13 ; [in/optional] ZMM with GHASH mid product sum
+
+ VCLMUL_STEP1 %%GDATA, %%BL47, %%ZT01, %%ZTH, %%ZTM, %%ZTL
+
+%if %0 > 10
+ ;; add optional sums before step2
+ vpxorq %%ZTH, %%ZTH, %%GH
+ vpxorq %%ZTL, %%ZTL, %%GL
+ vpxorq %%ZTM, %%ZTM, %%GM
+%endif
+
+ VCLMUL_STEP2 %%GDATA, %%BL47, %%BL03, %%ZT01, %%ZT02, %%ZT03, %%ZTH, %%ZTM, %%ZTL
+
+ vmovdqa64 XWORD(%%ZT03), [rel POLY2]
+ VCLMUL_REDUCE %%AAD_HASH, XWORD(%%ZT03), XWORD(%%BL47), XWORD(%%BL03), \
+ XWORD(%%ZT01), XWORD(%%ZT02)
+%endmacro ; GHASH_LAST_8
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GHASH the last 7 cipher text blocks.
+;;; - it uses the same GHASH macros as GHASH_LAST_8 but with a twist
+;;; - it loads GHASH keys for each of the data blocks, so that:
+;;; - blocks 4, 5 and 6 will use GHASH keys 3, 2, 1 respectively
+;;; - code ensures that unused block 7 and corresponding GHASH key are zeroed
+;;; (clmul product is zero this way and will not affect the result)
+;;; - blocks 0, 1, 2 and 3 will use GHASH keys 7, 6, 5 and 4 respectively
+;;; - optionally accepts GHASH product sums as input
+%macro GHASH_LAST_7 13-16
+%define %%GDATA %1 ; [in] key pointer
+%define %%BL47 %2 ; [in/clobbered] ZMM AES blocks 4 to 7
+%define %%BL03          %3      ; [in/clobbered] ZMM AES blocks 0 to 3
+%define %%ZTH           %4      ; [clobbered] ZMM temporary
+%define %%ZTM           %5      ; [clobbered] ZMM temporary
+%define %%ZTL           %6      ; [clobbered] ZMM temporary
+%define %%ZT01          %7      ; [clobbered] ZMM temporary
+%define %%ZT02          %8      ; [clobbered] ZMM temporary
+%define %%ZT03          %9      ; [clobbered] ZMM temporary
+%define %%ZT04          %10     ; [clobbered] ZMM temporary
+%define %%AAD_HASH %11 ; [out] XMM hash value
+%define %%MASKREG %12 ; [clobbered] mask register to use for loads
+%define %%IA0 %13 ; [clobbered] GP temporary register
+%define %%GH %14 ; [in/optional] ZMM with GHASH high product sum
+%define %%GL %15 ; [in/optional] ZMM with GHASH low product sum
+%define %%GM %16 ; [in/optional] ZMM with GHASH mid product sum
+
+ vmovdqa64 XWORD(%%ZT04), [rel POLY2]
+
+ VCLMUL_1_TO_8_STEP1 %%GDATA, %%BL47, %%ZT01, %%ZT02, %%ZTH, %%ZTM, %%ZTL, 7
+
+%if %0 > 13
+ ;; add optional sums before step2
+ vpxorq %%ZTH, %%ZTH, %%GH
+ vpxorq %%ZTL, %%ZTL, %%GL
+ vpxorq %%ZTM, %%ZTM, %%GM
+%endif
+
+ VCLMUL_1_TO_8_STEP2 %%GDATA, %%BL47, %%BL03, \
+ %%ZT01, %%ZT02, %%ZT03, \
+ %%ZTH, %%ZTM, %%ZTL, 7
+
+ VCLMUL_REDUCE %%AAD_HASH, XWORD(%%ZT04), XWORD(%%BL47), XWORD(%%BL03), \
+ XWORD(%%ZT01), XWORD(%%ZT02)
+%endmacro ; GHASH_LAST_7
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Encryption of a single block
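+;;; Standard AES flow: XOR with round key 0, NROUNDS middle rounds of
+;;; vaesenc, then vaesenclast with the last round key. NROUNDS is assumed
+;;; to count only the middle rounds (9/11/13 for AES-128/192/256).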
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxorq %%XMM0, %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep NROUNDS
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+ vaesenclast %%XMM0, [%%GDATA+16*i]
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Save register content for the caller
+%macro FUNC_SAVE 0
+        ;; Required for Update/GCM_ENC
+        ;; the number of pushes must equal STACK_OFFSET
+ mov rax, rsp
+
+ sub rsp, STACK_FRAME_SIZE
+ and rsp, ~63
+
+ mov [rsp + STACK_GP_OFFSET + 0*8], r12
+ mov [rsp + STACK_GP_OFFSET + 1*8], r13
+ mov [rsp + STACK_GP_OFFSET + 2*8], r14
+ mov [rsp + STACK_GP_OFFSET + 3*8], r15
+ mov [rsp + STACK_GP_OFFSET + 4*8], rax ; stack
+ mov r14, rax ; r14 is used to retrieve stack args
+ mov [rsp + STACK_GP_OFFSET + 5*8], rbp
+ mov [rsp + STACK_GP_OFFSET + 6*8], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + STACK_GP_OFFSET + 7*8], rdi
+ mov [rsp + STACK_GP_OFFSET + 8*8], rsi
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + STACK_XMM_OFFSET + 0*16], xmm6
+ vmovdqu [rsp + STACK_XMM_OFFSET + 1*16], xmm7
+ vmovdqu [rsp + STACK_XMM_OFFSET + 2*16], xmm8
+ vmovdqu [rsp + STACK_XMM_OFFSET + 3*16], xmm9
+ vmovdqu [rsp + STACK_XMM_OFFSET + 4*16], xmm10
+ vmovdqu [rsp + STACK_XMM_OFFSET + 5*16], xmm11
+ vmovdqu [rsp + STACK_XMM_OFFSET + 6*16], xmm12
+ vmovdqu [rsp + STACK_XMM_OFFSET + 7*16], xmm13
+ vmovdqu [rsp + STACK_XMM_OFFSET + 8*16], xmm14
+ vmovdqu [rsp + STACK_XMM_OFFSET + 9*16], xmm15
+%endif
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Restore register content for the caller
+%macro FUNC_RESTORE 0
+
+%ifdef SAFE_DATA
+ clear_scratch_gps_asm
+ clear_scratch_zmms_asm
+%else
+ vzeroupper
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15, [rsp + STACK_XMM_OFFSET + 9*16]
+ vmovdqu xmm14, [rsp + STACK_XMM_OFFSET + 8*16]
+ vmovdqu xmm13, [rsp + STACK_XMM_OFFSET + 7*16]
+ vmovdqu xmm12, [rsp + STACK_XMM_OFFSET + 6*16]
+ vmovdqu xmm11, [rsp + STACK_XMM_OFFSET + 5*16]
+ vmovdqu xmm10, [rsp + STACK_XMM_OFFSET + 4*16]
+ vmovdqu xmm9, [rsp + STACK_XMM_OFFSET + 3*16]
+ vmovdqu xmm8, [rsp + STACK_XMM_OFFSET + 2*16]
+ vmovdqu xmm7, [rsp + STACK_XMM_OFFSET + 1*16]
+ vmovdqu xmm6, [rsp + STACK_XMM_OFFSET + 0*16]
+%endif
+
+        ;; Required for Update/GCM_ENC
+ mov rbp, [rsp + STACK_GP_OFFSET + 5*8]
+ mov rbx, [rsp + STACK_GP_OFFSET + 6*8]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [rsp + STACK_GP_OFFSET + 7*8]
+ mov rsi, [rsp + STACK_GP_OFFSET + 8*8]
+%endif
+ mov r12, [rsp + STACK_GP_OFFSET + 0*8]
+ mov r13, [rsp + STACK_GP_OFFSET + 1*8]
+ mov r14, [rsp + STACK_GP_OFFSET + 2*8]
+ mov r15, [rsp + STACK_GP_OFFSET + 3*8]
+ mov rsp, [rsp + STACK_GP_OFFSET + 4*8] ; stack
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding.
+;;; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV,
+;;; Additional Authenticated Data (A_IN) and AAD length in bytes (A_LEN).
+;;; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and the remaining GDATA_CTX fields initialized.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 21
+%define %%GDATA_KEY %1 ; [in] GCM expanded keys pointer
+%define %%GDATA_CTX %2 ; [in] GCM context pointer
+%define %%IV %3 ; [in] IV pointer
+%define %%A_IN %4 ; [in] AAD pointer
+%define %%A_LEN %5 ; [in] AAD length in bytes
+%define %%GPR1 %6 ; [clobbered] GP register
+%define %%GPR2 %7 ; [clobbered] GP register
+%define %%GPR3 %8 ; [clobbered] GP register
+%define %%MASKREG %9 ; [clobbered] mask register
+%define %%AAD_HASH %10 ; [out] XMM for AAD_HASH value (xmm14)
+%define %%CUR_COUNT %11 ; [out] XMM with current counter (xmm2)
+%define %%ZT0 %12 ; [clobbered] ZMM register
+%define %%ZT1 %13 ; [clobbered] ZMM register
+%define %%ZT2 %14 ; [clobbered] ZMM register
+%define %%ZT3 %15 ; [clobbered] ZMM register
+%define %%ZT4 %16 ; [clobbered] ZMM register
+%define %%ZT5 %17 ; [clobbered] ZMM register
+%define %%ZT6 %18 ; [clobbered] ZMM register
+%define %%ZT7 %19 ; [clobbered] ZMM register
+%define %%ZT8 %20 ; [clobbered] ZMM register
+%define %%ZT9 %21 ; [clobbered] ZMM register
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%ZT9, \
+ %%GPR1, %%GPR2, %%GPR3, %%MASKREG
+
+ mov %%GPR1, %%A_LEN
+ vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx.aad hash = aad_hash
+ mov [%%GDATA_CTX + AadLen], %%GPR1 ; ctx.aad_length = aad_length
+
+ xor %%GPR1, %%GPR1
+ mov [%%GDATA_CTX + InLen], %%GPR1 ; ctx.in_length = 0
+ mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx.partial_block_length = 0
+
+ ;; read 12 IV bytes and pad with 0x00000001
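+        ;; (GCM J0 for a 96-bit IV: J0 = IV || 0^31 || 1. The 0x0fff byte
+        ;;  mask below selects bytes 0-11 for the IV load while bytes 12-15
+        ;;  keep the 0x00000001 pattern pre-loaded from ONEf.)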
+ vmovdqu8 %%CUR_COUNT, [rel ONEf]
+ mov %%GPR2, %%IV
+ mov %%GPR1, 0x0000_0000_0000_0fff
+ kmovq %%MASKREG, %%GPR1
+ vmovdqu8 %%CUR_COUNT{%%MASKREG}, [%%GPR2] ; ctr = IV | 0x1
+
+ vmovdqu64 [%%GDATA_CTX + OrigIV], %%CUR_COUNT ; ctx.orig_IV = iv
+
+ ;; store IV as counter in LE format
+ vpshufb %%CUR_COUNT, [rel SHUF_MASK]
+ vmovdqu [%%GDATA_CTX + CurCount], %%CUR_COUNT ; ctx.current_counter = iv
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Cipher and ghash of payloads shorter than 256 bytes
+;;; - number of blocks in the message comes as argument
+;;; - depending on the number of blocks an optimized variant of
+;;; INITIAL_BLOCKS_PARTIAL is invoked
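+;;; The dispatch below is a small compare tree into one of 16 variants
+;;; generated by the %rep block at the end of this macro; for example
+;;; %%NUM_BLOCKS = 10 falls past the 'cmp 8' checks, takes the 'jl' at
+;;; 'cmp 12' into %%_small_initial_num_blocks_is_11_9 and then lands on
+;;; %%_small_initial_num_blocks_is_10.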
+%macro GCM_ENC_DEC_SMALL 42
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer
+%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length
+%define %%ENC_DEC %6 ; [in] cipher direction
+%define %%DATA_OFFSET %7 ; [in] data offset
+%define %%LENGTH %8 ; [in] data length
+%define %%NUM_BLOCKS %9 ; [in] number of blocks to process 1 to 16
+%define %%CTR %10 ; [in/out] XMM counter block
+%define %%HASH_IN_OUT %11 ; [in/out] XMM GHASH value
+%define %%INSTANCE_TYPE %12 ; [in] single or multi call
+%define %%ZTMP0 %13 ; [clobbered] ZMM register
+%define %%ZTMP1 %14 ; [clobbered] ZMM register
+%define %%ZTMP2 %15 ; [clobbered] ZMM register
+%define %%ZTMP3 %16 ; [clobbered] ZMM register
+%define %%ZTMP4 %17 ; [clobbered] ZMM register
+%define %%ZTMP5 %18 ; [clobbered] ZMM register
+%define %%ZTMP6 %19 ; [clobbered] ZMM register
+%define %%ZTMP7 %20 ; [clobbered] ZMM register
+%define %%ZTMP8 %21 ; [clobbered] ZMM register
+%define %%ZTMP9 %22 ; [clobbered] ZMM register
+%define %%ZTMP10 %23 ; [clobbered] ZMM register
+%define %%ZTMP11 %24 ; [clobbered] ZMM register
+%define %%ZTMP12 %25 ; [clobbered] ZMM register
+%define %%ZTMP13 %26 ; [clobbered] ZMM register
+%define %%ZTMP14 %27 ; [clobbered] ZMM register
+%define %%ZTMP15 %28 ; [clobbered] ZMM register
+%define %%ZTMP16 %29 ; [clobbered] ZMM register
+%define %%ZTMP17 %30 ; [clobbered] ZMM register
+%define %%ZTMP18 %31 ; [clobbered] ZMM register
+%define %%ZTMP19 %32 ; [clobbered] ZMM register
+%define %%ZTMP20 %33 ; [clobbered] ZMM register
+%define %%ZTMP21 %34 ; [clobbered] ZMM register
+%define %%ZTMP22 %35 ; [clobbered] ZMM register
+%define %%GH %36 ; [in] ZMM ghash sum (high)
+%define %%GL %37 ; [in] ZMM ghash sum (low)
+%define %%GM %38 ; [in] ZMM ghash sum (middle)
+%define %%IA0 %39 ; [clobbered] GP register
+%define %%IA1 %40 ; [clobbered] GP register
+%define %%MASKREG %41 ; [clobbered] mask register
+%define %%SHUFMASK %42 ; [in] ZMM with BE/LE shuffle mask
+
+ cmp %%NUM_BLOCKS, 8
+ je %%_small_initial_num_blocks_is_8
+ jl %%_small_initial_num_blocks_is_7_1
+
+
+ cmp %%NUM_BLOCKS, 12
+ je %%_small_initial_num_blocks_is_12
+ jl %%_small_initial_num_blocks_is_11_9
+
+ ;; 16, 15, 14 or 13
+ cmp %%NUM_BLOCKS, 16
+ je %%_small_initial_num_blocks_is_16
+ cmp %%NUM_BLOCKS, 15
+ je %%_small_initial_num_blocks_is_15
+ cmp %%NUM_BLOCKS, 14
+ je %%_small_initial_num_blocks_is_14
+ jmp %%_small_initial_num_blocks_is_13
+
+%%_small_initial_num_blocks_is_11_9:
+ ;; 11, 10 or 9
+ cmp %%NUM_BLOCKS, 11
+ je %%_small_initial_num_blocks_is_11
+ cmp %%NUM_BLOCKS, 10
+ je %%_small_initial_num_blocks_is_10
+ jmp %%_small_initial_num_blocks_is_9
+
+%%_small_initial_num_blocks_is_7_1:
+ cmp %%NUM_BLOCKS, 4
+ je %%_small_initial_num_blocks_is_4
+ jl %%_small_initial_num_blocks_is_3_1
+ ;; 7, 6 or 5
+ cmp %%NUM_BLOCKS, 7
+ je %%_small_initial_num_blocks_is_7
+ cmp %%NUM_BLOCKS, 6
+ je %%_small_initial_num_blocks_is_6
+ jmp %%_small_initial_num_blocks_is_5
+
+%%_small_initial_num_blocks_is_3_1:
+ ;; 3, 2 or 1
+ cmp %%NUM_BLOCKS, 3
+ je %%_small_initial_num_blocks_is_3
+ cmp %%NUM_BLOCKS, 2
+ je %%_small_initial_num_blocks_is_2
+
+ ;; for %%NUM_BLOCKS == 1, just fall through and no 'jmp' needed
+
+        ;; Use %rep to generate the different block count variants
+        ;; - the one block variant has to come first (fall-through case above)
+%assign num_blocks 1
+%rep 16
+%%_small_initial_num_blocks_is_ %+ num_blocks :
+ INITIAL_BLOCKS_PARTIAL %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, \
+ %%PLAIN_CYPH_IN, %%LENGTH, %%DATA_OFFSET, num_blocks, \
+ %%CTR, %%HASH_IN_OUT, %%ENC_DEC, %%INSTANCE_TYPE, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, \
+ %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, \
+ %%ZTMP15, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%IA0, %%IA1, %%MASKREG, %%SHUFMASK
+%if num_blocks != 16
+ jmp %%_small_initial_blocks_encrypted
+%endif
+%assign num_blocks (num_blocks + 1)
+%endrep
+
+%%_small_initial_blocks_encrypted:
+
+%endmacro ; GCM_ENC_DEC_SMALL
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
+; has been initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
+; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CYPH_IN),
+; input text length (PLAIN_CYPH_LEN) and whether encoding or decoding (ENC_DEC).
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA_CTX
+; Clobbers rax, r10-r15, and zmm0-zmm31, k1
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 7
+%define %%GDATA_KEY %1 ; [in] key pointer
+%define %%GDATA_CTX %2 ; [in] context pointer
+%define %%CYPH_PLAIN_OUT %3 ; [in] output buffer pointer
+%define %%PLAIN_CYPH_IN %4 ; [in] input buffer pointer
+%define %%PLAIN_CYPH_LEN %5 ; [in] buffer length
+%define %%ENC_DEC %6 ; [in] cipher direction
+%define %%INSTANCE_TYPE %7 ; [in] 'single_call' or 'multi_call' selection
+
+%define %%IA0 r10
+%define %%IA1 r12
+%define %%IA2 r13
+%define %%IA3 r15
+%define %%IA4 r11
+%define %%IA5 rax
+
+%define %%LENGTH %%IA2
+%define %%CTR_CHECK %%IA3
+%define %%DATA_OFFSET %%IA4
+
+%define %%HASHK_PTR %%IA5
+
+%define %%GCM_INIT_CTR_BLOCK xmm2 ; hardcoded in GCM_INIT for now
+
+%define %%AES_PARTIAL_BLOCK xmm8
+%define %%CTR_BLOCK2z zmm18
+%define %%CTR_BLOCKz zmm9
+%define %%CTR_BLOCKx xmm9
+%define %%AAD_HASHz zmm14
+%define %%AAD_HASHx xmm14
+
+;;; ZTMP0 - ZTMP12 - used in by8 code, by128/48 code and GCM_ENC_DEC_SMALL
+%define %%ZTMP0 zmm0
+%define %%ZTMP1 zmm3
+%define %%ZTMP2 zmm4
+%define %%ZTMP3 zmm5
+%define %%ZTMP4 zmm6
+%define %%ZTMP5 zmm7
+%define %%ZTMP6 zmm10
+%define %%ZTMP7 zmm11
+%define %%ZTMP8 zmm12
+%define %%ZTMP9 zmm13
+%define %%ZTMP10 zmm15
+%define %%ZTMP11 zmm16
+%define %%ZTMP12 zmm17
+
+;;; ZTMP13 - ZTMP22 - used in by128/48 code and GCM_ENC_DEC_SMALL
+;;; - some are used by the by8 code as well through TMPxy names
+%define %%ZTMP13 zmm19
+%define %%ZTMP14 zmm20
+%define %%ZTMP15 zmm21
+%define %%ZTMP16 zmm30 ; can be used in very/big_loop part
+%define %%ZTMP17 zmm31 ; can be used in very/big_loop part
+%define %%ZTMP18 zmm1
+%define %%ZTMP19 zmm2
+%define %%ZTMP20 zmm8
+%define %%ZTMP21 zmm22
+%define %%ZTMP22 zmm23
+
+;;; Free to use: zmm24 - zmm29
+;;; - used by by128/48 and by8
+%define %%GH zmm24
+%define %%GL zmm25
+%define %%GM zmm26
+%define %%SHUF_MASK zmm29
+%define %%CTR_BLOCK_SAVE zmm28
+
+;;; - used by by128/48 code only
+%define %%ADDBE_4x4 zmm27
+%define %%ADDBE_1234 zmm28 ; conflicts with CTR_BLOCK_SAVE
+
+;; used by the by8 code only
+%define %%GH4KEY %%ZTMP17
+%define %%GH8KEY %%ZTMP16
+%define %%BLK0 %%ZTMP18
+%define %%BLK1 %%ZTMP19
+%define %%ADD8BE zmm27
+%define %%ADD8LE %%ZTMP13
+
+%define %%MASKREG k1
+
+%ifdef GCM_BIG_DATA
+;; reduction every 128 blocks, depth 32 blocks
+;; @note 128 blocks is the maximum capacity of the stack frame when
+;; GCM_BIG_DATA is defined
+%assign very_big_loop_nblocks 128
+%assign very_big_loop_depth 32
+%endif
+
+;; reduction every 48 blocks, depth 32 blocks
+;; @note 48 blocks is the maximum capacity of the stack frame when
+;; GCM_BIG_DATA is not defined
+%assign big_loop_nblocks 48
+%assign big_loop_depth 32
+
+;;; Macro flow:
+;;; - for message size bigger than very_big_loop_nblocks process data
+;;; with "very_big_loop" parameters
+;;; - for message size bigger than big_loop_nblocks process data
+;;; with "big_loop" parameters
+;;; - calculate the number of 16-byte blocks in the message
+;;; - process (number of 16-byte blocks) mod 8
+;;;   '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+;;; - process 8 x 16-byte blocks at a time until all are done in %%_encrypt_by_8_new
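+;;; Rough message-size buckets (without GCM_BIG_DATA):
+;;; - length < 256 bytes: GCM_ENC_DEC_SMALL (up to 16 blocks)
+;;; - 256 bytes <= length < 48 x 16 bytes: INITIAL_BLOCKS + the by8 loop
+;;; - length >= 48 x 16 bytes: the by48 stitched loop (by128 first when
+;;;   GCM_BIG_DATA is defined and length >= 128 x 16 bytes); left-over data
+;;;   after a big loop drops back into the smaller paths above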
+
+%ifidn __OUTPUT_FORMAT__, win64
+ cmp %%PLAIN_CYPH_LEN, 0
+%else
+ or %%PLAIN_CYPH_LEN, %%PLAIN_CYPH_LEN
+%endif
+ je %%_enc_dec_done
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+
+ ;; Update length of data processed
+%ifidn __OUTPUT_FORMAT__, win64
+ mov %%IA0, %%PLAIN_CYPH_LEN
+ add [%%GDATA_CTX + InLen], %%IA0
+%else
+ add [%%GDATA_CTX + InLen], %%PLAIN_CYPH_LEN
+%endif
+ vmovdqu64 %%AAD_HASHx, [%%GDATA_CTX + AadHash]
+
+%ifidn %%INSTANCE_TYPE, multi_call
+        ;; NOTE: partial block processing only makes sense for multi_call here.
+ ;; Used for the update flow - if there was a previous partial
+ ;; block fill the remaining bytes here.
+ PARTIAL_BLOCK %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%PLAIN_CYPH_LEN, %%DATA_OFFSET, %%AAD_HASHx, %%ENC_DEC, \
+ %%IA0, %%IA1, %%IA2, %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, %%MASKREG
+%endif
+
+ ;; lift counter block from GCM_INIT to here
+%ifidn %%INSTANCE_TYPE, single_call
+ vmovdqu64 %%CTR_BLOCKx, %%GCM_INIT_CTR_BLOCK
+%else
+ vmovdqu64 %%CTR_BLOCKx, [%%GDATA_CTX + CurCount]
+%endif
+
+ ;; Save the amount of data left to process in %%LENGTH
+ mov %%LENGTH, %%PLAIN_CYPH_LEN
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; NOTE: %%DATA_OFFSET is zero in single_call case.
+ ;; Consequently PLAIN_CYPH_LEN will never be zero after
+ ;; %%DATA_OFFSET subtraction below.
+ ;; There may be no more data if it was consumed in the partial block.
+ sub %%LENGTH, %%DATA_OFFSET
+ je %%_enc_dec_done
+%endif ; %%INSTANCE_TYPE, multi_call
+
+ vmovdqa64 %%SHUF_MASK, [rel SHUF_MASK]
+ vmovdqa64 %%ADDBE_4x4, [rel ddq_addbe_4444]
+
+%ifdef GCM_BIG_DATA
+ vmovdqa64 %%ADDBE_1234, [rel ddq_addbe_1234]
+
+ cmp %%LENGTH, (very_big_loop_nblocks * 16)
+ jl %%_message_below_very_big_nblocks
+
+ INITIAL_BLOCKS_Nx16 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%AAD_HASHz, %%CTR_BLOCKz, %%CTR_CHECK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%SHUF_MASK, %%ENC_DEC, very_big_loop_nblocks, very_big_loop_depth
+
+ sub %%LENGTH, (very_big_loop_nblocks * 16)
+ cmp %%LENGTH, (very_big_loop_nblocks * 16)
+ jl %%_no_more_very_big_nblocks
+
+%%_encrypt_very_big_nblocks:
+ GHASH_ENCRYPT_Nx16_PARALLEL \
+ %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%CTR_BLOCKz, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, %%AAD_HASHz, \
+ %%ENC_DEC, very_big_loop_nblocks, very_big_loop_depth, %%CTR_CHECK
+
+ sub %%LENGTH, (very_big_loop_nblocks * 16)
+ cmp %%LENGTH, (very_big_loop_nblocks * 16)
+ jge %%_encrypt_very_big_nblocks
+
+%%_no_more_very_big_nblocks:
+ vpshufb %%CTR_BLOCKx, XWORD(%%SHUF_MASK)
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+ GHASH_LAST_Nx16 %%GDATA_KEY, %%AAD_HASHz, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%GH, %%GL, %%GM, very_big_loop_nblocks, very_big_loop_depth
+
+ or %%LENGTH, %%LENGTH
+ jz %%_ghash_done
+
+%%_message_below_very_big_nblocks:
+%endif ; GCM_BIG_DATA
+
+ cmp %%LENGTH, (big_loop_nblocks * 16)
+ jl %%_message_below_big_nblocks
+
+ ;; overwritten above by CTR_BLOCK_SAVE
+ vmovdqa64 %%ADDBE_1234, [rel ddq_addbe_1234]
+
+ INITIAL_BLOCKS_Nx16 %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%AAD_HASHz, %%CTR_BLOCKz, %%CTR_CHECK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%SHUF_MASK, %%ENC_DEC, big_loop_nblocks, big_loop_depth
+
+ sub %%LENGTH, (big_loop_nblocks * 16)
+ cmp %%LENGTH, (big_loop_nblocks * 16)
+ jl %%_no_more_big_nblocks
+
+%%_encrypt_big_nblocks:
+ GHASH_ENCRYPT_Nx16_PARALLEL \
+ %%PLAIN_CYPH_IN, %%CYPH_PLAIN_OUT, %%GDATA_KEY, %%DATA_OFFSET, \
+ %%CTR_BLOCKz, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ %%GH, %%GL, %%GM, \
+ %%ADDBE_4x4, %%ADDBE_1234, %%AAD_HASHz, \
+ %%ENC_DEC, big_loop_nblocks, big_loop_depth, %%CTR_CHECK
+
+ sub %%LENGTH, (big_loop_nblocks * 16)
+ cmp %%LENGTH, (big_loop_nblocks * 16)
+ jge %%_encrypt_big_nblocks
+
+%%_no_more_big_nblocks:
+ vpshufb %%CTR_BLOCKx, XWORD(%%SHUF_MASK)
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+ GHASH_LAST_Nx16 %%GDATA_KEY, %%AAD_HASHz, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%GH, %%GL, %%GM, big_loop_nblocks, big_loop_depth
+
+ or %%LENGTH, %%LENGTH
+ jz %%_ghash_done
+
+%%_message_below_big_nblocks:
+
+ ;; Less than 256 bytes will be handled by the small message code, which
+        ;; can process up to 16 blocks (16 bytes each)
+ cmp %%LENGTH, (16 * 16)
+ jge %%_large_message_path
+
+ ;; Determine how many blocks to process
+ ;; - process one additional block if there is a partial block
+ mov %%IA1, %%LENGTH
+ add %%IA1, 15
+ shr %%IA1, 4
+ ;; %%IA1 can be in the range from 0 to 16
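+        ;; e.g. %%LENGTH = 40 bytes -> (40 + 15) >> 4 = 3 blocks (1 partial)
+        ;;      %%LENGTH = 48 bytes -> (48 + 15) >> 4 = 3 blocks (no partial)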
+
+ GCM_ENC_DEC_SMALL \
+ %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%PLAIN_CYPH_LEN, %%ENC_DEC, %%DATA_OFFSET, \
+ %%LENGTH, %%IA1, %%CTR_BLOCKx, %%AAD_HASHx, %%INSTANCE_TYPE, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, \
+ %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
+ %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
+ %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+ %%ZTMP20, %%ZTMP21, %%ZTMP22, \
+ no_zmm, no_zmm, no_zmm, \
+ %%IA0, %%IA3, %%MASKREG, %%SHUF_MASK
+
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+ jmp %%_ghash_done
+
+%%_large_message_path:
+ ;; Determine how many blocks to process in INITIAL
+ ;; - process one additional block in INITIAL if there is a partial block
+ mov %%IA1, %%LENGTH
+ and %%IA1, 0xff
+ add %%IA1, 15
+ shr %%IA1, 4
+ ;; Don't allow 8 INITIAL blocks since this will
+ ;; be handled by the x8 partial loop.
+ and %%IA1, 7
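+        ;; e.g. %%LENGTH = 300 bytes: 300 & 0xff = 44, (44 + 15) >> 4 = 3,
+        ;;      3 & 7 = 3 -> three INITIAL blocks before the by8 loop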
+ je %%_initial_num_blocks_is_0
+ cmp %%IA1, 1
+ je %%_initial_num_blocks_is_1
+ cmp %%IA1, 2
+ je %%_initial_num_blocks_is_2
+ cmp %%IA1, 3
+ je %%_initial_num_blocks_is_3
+ cmp %%IA1, 4
+ je %%_initial_num_blocks_is_4
+ cmp %%IA1, 5
+ je %%_initial_num_blocks_is_5
+ cmp %%IA1, 6
+ je %%_initial_num_blocks_is_6
+
+%assign number_of_blocks 7
+%rep 8
+%%_initial_num_blocks_is_ %+ number_of_blocks:
+ INITIAL_BLOCKS %%GDATA_KEY, %%GDATA_CTX, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%LENGTH, %%DATA_OFFSET, number_of_blocks, %%CTR_BLOCKx, %%AAD_HASHz, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, \
+ %%ZTMP5, %%ZTMP6, %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, \
+ %%IA0, %%IA1, %%ENC_DEC, %%MASKREG, %%SHUF_MASK, no_partial_block
+%if number_of_blocks != 0
+ jmp %%_initial_blocks_encrypted
+%endif
+%assign number_of_blocks (number_of_blocks - 1)
+%endrep
+
+%%_initial_blocks_encrypted:
+ vmovdqa64 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCKx
+
+        ;; move cipher blocks from the initial blocks to the input of the by8 macro
+ ;; and for GHASH_LAST_8/7
+ ;; - ghash value already xor'ed into block 0
+ vmovdqa64 %%BLK0, %%ZTMP0
+ vmovdqa64 %%BLK1, %%ZTMP1
+
+ ;; The entire message cannot get processed in INITIAL_BLOCKS
+ ;; - GCM_ENC_DEC_SMALL handles up to 16 blocks
+ ;; - INITIAL_BLOCKS processes up to 15 blocks
+ ;; - no need to check for zero length at this stage
+
+ ;; In order to have only one reduction at the end
+ ;; start HASH KEY pointer needs to be determined based on length and
+ ;; call type.
+ ;; - note that 8 blocks are already ciphered in INITIAL_BLOCKS and
+ ;; subtracted from LENGTH
+ lea %%IA1, [%%LENGTH + (8 * 16)]
+ add %%IA1, 15
+ and %%IA1, 0x3f0
+%ifidn %%INSTANCE_TYPE, multi_call
+ ;; if partial block and multi_call then change hash key start by one
+ mov %%IA0, %%LENGTH
+ and %%IA0, 15
+ add %%IA0, 15
+ and %%IA0, 16
+ sub %%IA1, %%IA0
+%endif
+ lea %%HASHK_PTR, [%%GDATA_KEY + HashKey + 16]
+ sub %%HASHK_PTR, %%IA1
+ ;; HASHK_PTR
+ ;; - points at the first hash key to start GHASH with
+ ;; - needs to be updated as the message is processed (incremented)
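+        ;; i.e. the starting key power is picked so that, advancing by
+        ;; 8 keys per 8-block iteration, the powers count down to
+        ;; HashKey_8..1 for the final blocks (hashed by GHASH_LAST_8/7),
+        ;; which is what allows a single reduction for the whole message.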
+
+ ;; pre-load constants
+ vmovdqa64 %%ADD8BE, [rel ddq_addbe_8888]
+ vmovdqa64 %%ADD8LE, [rel ddq_add_8888]
+ vpxorq %%GH, %%GH
+ vpxorq %%GL, %%GL
+ vpxorq %%GM, %%GM
+
+ ;; prepare counter 8 blocks
+ vshufi64x2 %%CTR_BLOCKz, %%CTR_BLOCKz, %%CTR_BLOCKz, 0
+ vpaddd %%CTR_BLOCK2z, %%CTR_BLOCKz, [rel ddq_add_5678]
+ vpaddd %%CTR_BLOCKz, %%CTR_BLOCKz, [rel ddq_add_1234]
+ vpshufb %%CTR_BLOCKz, %%SHUF_MASK
+ vpshufb %%CTR_BLOCK2z, %%SHUF_MASK
+
+ ;; Process 7 full blocks plus a partial block
+ cmp %%LENGTH, 128
+ jl %%_encrypt_by_8_partial
+
+%%_encrypt_by_8_parallel:
+ ;; in_order vs. out_order is an optimization to increment the counter
+ ;; without shuffling it back into little endian.
+ ;; %%CTR_CHECK keeps track of when we need to increment in order so
+ ;; that the carry is handled correctly.
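+        ;; Adding the big-endian constant 8 (ADD8BE) is only safe while no
+        ;; carry propagates out of the lowest counter byte; %%CTR_CHECK
+        ;; tracks that byte and, once the next addition would overflow it,
+        ;; the %%_encrypt_by_8 path shuffles the counters to LE, increments
+        ;; them there and shuffles back to BE.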
+
+ vmovq %%CTR_CHECK, XWORD(%%CTR_BLOCK_SAVE)
+
+%%_encrypt_by_8_new:
+ and WORD(%%CTR_CHECK), 255
+ add WORD(%%CTR_CHECK), 8
+
+ vmovdqu64 %%GH4KEY, [%%HASHK_PTR + (4 * 16)]
+ vmovdqu64 %%GH8KEY, [%%HASHK_PTR + (0 * 16)]
+
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%DATA_OFFSET, %%CTR_BLOCKz, %%CTR_BLOCK2z,\
+ %%BLK0, %%BLK1, %%AES_PARTIAL_BLOCK, \
+ out_order, %%ENC_DEC, full, %%IA0, %%IA1, %%LENGTH, %%INSTANCE_TYPE, \
+ %%GH4KEY, %%GH8KEY, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \
+ %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, \
+ %%MASKREG, no_reduction, %%GL, %%GH, %%GM
+
+ add %%HASHK_PTR, (8 * 16)
+ add %%DATA_OFFSET, 128
+ sub %%LENGTH, 128
+ jz %%_encrypt_done
+
+ cmp WORD(%%CTR_CHECK), (256 - 8)
+ jae %%_encrypt_by_8
+
+ vpaddd %%CTR_BLOCKz, %%ADD8BE
+ vpaddd %%CTR_BLOCK2z, %%ADD8BE
+
+ cmp %%LENGTH, 128
+ jl %%_encrypt_by_8_partial
+
+ jmp %%_encrypt_by_8_new
+
+%%_encrypt_by_8:
+ vpshufb %%CTR_BLOCKz, %%SHUF_MASK
+ vpshufb %%CTR_BLOCK2z, %%SHUF_MASK
+ vpaddd %%CTR_BLOCKz, %%ADD8LE
+ vpaddd %%CTR_BLOCK2z, %%ADD8LE
+ vpshufb %%CTR_BLOCKz, %%SHUF_MASK
+ vpshufb %%CTR_BLOCK2z, %%SHUF_MASK
+
+ cmp %%LENGTH, 128
+ jge %%_encrypt_by_8_new
+
+%%_encrypt_by_8_partial:
+ ;; Test to see if we need a by 8 with partial block. At this point
+        ;; bytes remaining should be either zero or between 113 and 127.
+ ;; 'in_order' shuffle needed to align key for partial block xor.
+ ;; 'out_order' is a little faster because it avoids extra shuffles.
+ ;; - counter blocks for the next 8 blocks are prepared and in BE format
+ ;; - we can go ahead with out_order scenario
+
+ vmovdqu64 %%GH4KEY, [%%HASHK_PTR + (4 * 16)]
+ vmovdqu64 %%GH8KEY, [%%HASHK_PTR + (0 * 16)]
+
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA_KEY, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, \
+ %%DATA_OFFSET, %%CTR_BLOCKz, %%CTR_BLOCK2z, \
+ %%BLK0, %%BLK1, %%AES_PARTIAL_BLOCK, \
+ out_order, %%ENC_DEC, partial, %%IA0, %%IA1, %%LENGTH, %%INSTANCE_TYPE, \
+ %%GH4KEY, %%GH8KEY, %%SHUF_MASK, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \
+ %%ZTMP7, %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, \
+ %%MASKREG, no_reduction, %%GL, %%GH, %%GM
+
+ add %%HASHK_PTR, (8 * 16)
+ add %%DATA_OFFSET, (128 - 16)
+ sub %%LENGTH, (128 - 16)
+
+%ifidn %%INSTANCE_TYPE, multi_call
+ mov [%%GDATA_CTX + PBlockLen], %%LENGTH
+ vmovdqu64 [%%GDATA_CTX + PBlockEncKey], %%AES_PARTIAL_BLOCK
+%endif
+
+%%_encrypt_done:
+ ;; Extract the last counter block in LE format
+ vextracti32x4 XWORD(%%CTR_BLOCK_SAVE), %%CTR_BLOCK2z, 3
+ vpshufb XWORD(%%CTR_BLOCK_SAVE), XWORD(%%SHUF_MASK)
+
+ ;; GHASH last cipher text blocks in xmm1-xmm8
+        ;; - if the 8th block is partial in a multi-call path then skip that block
+%ifidn %%INSTANCE_TYPE, multi_call
+ cmp qword [%%GDATA_CTX + PBlockLen], 0
+ jz %%_hash_last_8
+
+ ;; save the 8th partial block as GHASH_LAST_7 will clobber %%BLK1
+ vextracti32x4 XWORD(%%ZTMP7), %%BLK1, 3
+
+ GHASH_LAST_7 %%GDATA_KEY, %%BLK1, %%BLK0, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, \
+ %%AAD_HASHx, %%MASKREG, %%IA0, %%GH, %%GL, %%GM
+
+        ;; XOR the partial block into the hash
+ vpxorq %%AAD_HASHx, %%AAD_HASHx, XWORD(%%ZTMP7)
+ jmp %%_ghash_done
+%%_hash_last_8:
+%endif
+ GHASH_LAST_8 %%GDATA_KEY, %%BLK1, %%BLK0, \
+ %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%AAD_HASHx, \
+ %%GH, %%GL, %%GM
+%%_ghash_done:
+ vmovdqu64 [%%GDATA_CTX + CurCount], XWORD(%%CTR_BLOCK_SAVE)
+ vmovdqu64 [%%GDATA_CTX + AadHash], %%AAD_HASHx
+%%_enc_dec_done:
+
+%endmacro ; GCM_ENC_DEC
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Encrypt/decrypt the initial 16 blocks
+%macro INITIAL_BLOCKS_16 22
+%define %%IN %1 ; [in] input buffer
+%define %%OUT %2 ; [in] output buffer
+%define %%KP %3 ; [in] pointer to expanded keys
+%define %%DATA_OFFSET %4 ; [in] data offset
+%define %%GHASH %5 ; [in] ZMM with AAD (low 128 bits)
+%define %%CTR %6 ; [in] ZMM with CTR BE blocks 4x128 bits
+%define %%CTR_CHECK %7 ; [in/out] GPR with counter overflow check
+%define %%ADDBE_4x4 %8 ; [in] ZMM 4x128bits with value 4 (big endian)
+%define %%ADDBE_1234 %9 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
+%define %%T0            %10     ; [clobbered] temporary ZMM register
+%define %%T1            %11     ; [clobbered] temporary ZMM register
+%define %%T2            %12     ; [clobbered] temporary ZMM register
+%define %%T3            %13     ; [clobbered] temporary ZMM register
+%define %%T4            %14     ; [clobbered] temporary ZMM register
+%define %%T5            %15     ; [clobbered] temporary ZMM register
+%define %%T6            %16     ; [clobbered] temporary ZMM register
+%define %%T7            %17     ; [clobbered] temporary ZMM register
+%define %%T8            %18     ; [clobbered] temporary ZMM register
+%define %%SHUF_MASK %19 ; [in] ZMM with BE/LE shuffle mask
+%define %%ENC_DEC %20 ; [in] ENC (encrypt) or DEC (decrypt) selector
+%define %%BLK_OFFSET %21 ; [in] stack frame offset to ciphered blocks
+%define %%DATA_DISPL %22 ; [in] fixed numerical data displacement/offset
+
+%define %%B00_03 %%T5
+%define %%B04_07 %%T6
+%define %%B08_11 %%T7
+%define %%B12_15 %%T8
+
+%assign stack_offset (%%BLK_OFFSET)
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; prepare counter blocks
+
+ cmp BYTE(%%CTR_CHECK), (256 - 16)
+ jae %%_next_16_overflow
+ vpaddd %%B00_03, %%CTR, %%ADDBE_1234
+ vpaddd %%B04_07, %%B00_03, %%ADDBE_4x4
+ vpaddd %%B08_11, %%B04_07, %%ADDBE_4x4
+ vpaddd %%B12_15, %%B08_11, %%ADDBE_4x4
+ jmp %%_next_16_ok
+%%_next_16_overflow:
+ vpshufb %%CTR, %%CTR, %%SHUF_MASK
+ vmovdqa64 %%B12_15, [rel ddq_add_4444]
+ vpaddd %%B00_03, %%CTR, [rel ddq_add_1234]
+ vpaddd %%B04_07, %%B00_03, %%B12_15
+ vpaddd %%B08_11, %%B04_07, %%B12_15
+ vpaddd %%B12_15, %%B08_11, %%B12_15
+ vpshufb %%B00_03, %%SHUF_MASK
+ vpshufb %%B04_07, %%SHUF_MASK
+ vpshufb %%B08_11, %%SHUF_MASK
+ vpshufb %%B12_15, %%SHUF_MASK
+%%_next_16_ok:
+ vshufi64x2 %%CTR, %%B12_15, %%B12_15, 1111_1111b
+ add BYTE(%%CTR_CHECK), 16
+
+ ;; === load 16 blocks of data
+ VX512LDR %%T0, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*0)]
+ VX512LDR %%T1, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*1)]
+ VX512LDR %%T2, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*2)]
+ VX512LDR %%T3, [%%IN + %%DATA_OFFSET + %%DATA_DISPL + (64*3)]
+
+ ;; move to AES encryption rounds
+%assign i 0
+ vbroadcastf64x2 %%T4, [%%KP + (16*i)]
+ vpxorq %%B00_03, %%B00_03, %%T4
+ vpxorq %%B04_07, %%B04_07, %%T4
+ vpxorq %%B08_11, %%B08_11, %%T4
+ vpxorq %%B12_15, %%B12_15, %%T4
+%assign i (i + 1)
+
+%rep NROUNDS
+ vbroadcastf64x2 %%T4, [%%KP + (16*i)]
+ vaesenc %%B00_03, %%B00_03, %%T4
+ vaesenc %%B04_07, %%B04_07, %%T4
+ vaesenc %%B08_11, %%B08_11, %%T4
+ vaesenc %%B12_15, %%B12_15, %%T4
+%assign i (i + 1)
+%endrep
+
+ vbroadcastf64x2 %%T4, [%%KP + (16*i)]
+ vaesenclast %%B00_03, %%B00_03, %%T4
+ vaesenclast %%B04_07, %%B04_07, %%T4
+ vaesenclast %%B08_11, %%B08_11, %%T4
+ vaesenclast %%B12_15, %%B12_15, %%T4
+
+ ;; xor against text
+ vpxorq %%B00_03, %%B00_03, %%T0
+ vpxorq %%B04_07, %%B04_07, %%T1
+ vpxorq %%B08_11, %%B08_11, %%T2
+ vpxorq %%B12_15, %%B12_15, %%T3
+
+ ;; store
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*0)], %%B00_03
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*1)], %%B04_07
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*2)], %%B08_11
+ VX512STR [%%OUT + %%DATA_OFFSET + %%DATA_DISPL + (64*3)], %%B12_15
+
+%ifidn %%ENC_DEC, DEC
+ ;; decryption - cipher text needs to go to GHASH phase
+ vpshufb %%B00_03, %%T0, %%SHUF_MASK
+ vpshufb %%B04_07, %%T1, %%SHUF_MASK
+ vpshufb %%B08_11, %%T2, %%SHUF_MASK
+ vpshufb %%B12_15, %%T3, %%SHUF_MASK
+%else
+ ;; encryption
+ vpshufb %%B00_03, %%B00_03, %%SHUF_MASK
+ vpshufb %%B04_07, %%B04_07, %%SHUF_MASK
+ vpshufb %%B08_11, %%B08_11, %%SHUF_MASK
+ vpshufb %%B12_15, %%B12_15, %%SHUF_MASK
+%endif
+
+%ifnidn %%GHASH, no_ghash
+ ;; === xor cipher block 0 with GHASH for the next GHASH round
+ vpxorq %%B00_03, %%B00_03, %%GHASH
+%endif
+
+ vmovdqa64 [rsp + stack_offset + (0 * 64)], %%B00_03
+ vmovdqa64 [rsp + stack_offset + (1 * 64)], %%B04_07
+ vmovdqa64 [rsp + stack_offset + (2 * 64)], %%B08_11
+ vmovdqa64 [rsp + stack_offset + (3 * 64)], %%B12_15
+%endmacro ;INITIAL_BLOCKS_16
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Encrypt the initial N x 16 blocks
+;;; - A x 16 blocks are encrypted/decrypted first (pipeline depth)
+;;; - B x 16 blocks are encrypted/decrypted and previous A x 16 are ghashed
+;;; - A + B = N
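+;;; e.g. with the big_loop parameters (N = 48, depth = 32) the first 32
+;;; blocks are only AES-CTR processed (filling the pipeline and the stack
+;;; frame), then blocks 33-48 are ciphered while blocks 1-16 are GHASHed;
+;;; the outstanding GHASH work is completed later by
+;;; GHASH_ENCRYPT_Nx16_PARALLEL / GHASH_LAST_Nx16.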
+%macro INITIAL_BLOCKS_Nx16 39
+%define %%IN %1 ; [in] input buffer
+%define %%OUT %2 ; [in] output buffer
+%define %%KP %3 ; [in] pointer to expanded keys
+%define %%DATA_OFFSET %4 ; [in/out] data offset
+%define %%GHASH %5 ; [in] ZMM with AAD (low 128 bits)
+%define %%CTR %6 ; [in/out] ZMM with CTR: in - LE & 128b; out - BE & 4x128b
+%define %%CTR_CHECK %7 ; [in/out] GPR with counter overflow check
+%define %%T0            %8      ; [clobbered] temporary ZMM register
+%define %%T1            %9      ; [clobbered] temporary ZMM register
+%define %%T2            %10     ; [clobbered] temporary ZMM register
+%define %%T3            %11     ; [clobbered] temporary ZMM register
+%define %%T4            %12     ; [clobbered] temporary ZMM register
+%define %%T5            %13     ; [clobbered] temporary ZMM register
+%define %%T6            %14     ; [clobbered] temporary ZMM register
+%define %%T7            %15     ; [clobbered] temporary ZMM register
+%define %%T8            %16     ; [clobbered] temporary ZMM register
+%define %%T9            %17     ; [clobbered] temporary ZMM register
+%define %%T10           %18     ; [clobbered] temporary ZMM register
+%define %%T11           %19     ; [clobbered] temporary ZMM register
+%define %%T12           %20     ; [clobbered] temporary ZMM register
+%define %%T13           %21     ; [clobbered] temporary ZMM register
+%define %%T14           %22     ; [clobbered] temporary ZMM register
+%define %%T15           %23     ; [clobbered] temporary ZMM register
+%define %%T16           %24     ; [clobbered] temporary ZMM register
+%define %%T17           %25     ; [clobbered] temporary ZMM register
+%define %%T18           %26     ; [clobbered] temporary ZMM register
+%define %%T19           %27     ; [clobbered] temporary ZMM register
+%define %%T20           %28     ; [clobbered] temporary ZMM register
+%define %%T21           %29     ; [clobbered] temporary ZMM register
+%define %%T22           %30     ; [clobbered] temporary ZMM register
+%define %%GH %31 ; [out] ZMM ghash sum (high)
+%define %%GL %32 ; [out] ZMM ghash sum (low)
+%define %%GM %33 ; [out] ZMM ghash sum (middle)
+%define %%ADDBE_4x4 %34 ; [in] ZMM 4x128bits with value 4 (big endian)
+%define %%ADDBE_1234 %35 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
+%define %%SHUF_MASK %36 ; [in] ZMM with BE/LE shuffle mask
+%define %%ENC_DEC %37 ; [in] ENC (encrypt) or DEC (decrypt) selector
+%define %%NBLOCKS %38 ; [in] number of blocks: multiple of 16
+%define %%DEPTH_BLK     %39     ; [in] pipeline depth, number of blocks (multiple of 16)
+
+%assign aesout_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign ghashin_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign hkey_offset HashKey_ %+ %%NBLOCKS
+%assign data_in_out_offset 0
+
+ ;; set up CTR_CHECK
+ vmovd DWORD(%%CTR_CHECK), XWORD(%%CTR)
+ and DWORD(%%CTR_CHECK), 255
+
+ ;; in LE format after init, convert to BE
+ vshufi64x2 %%CTR, %%CTR, %%CTR, 0
+ vpshufb %%CTR, %%CTR, %%SHUF_MASK
+
+ ;; ==== AES lead in
+
+ ;; first 16 blocks - just cipher
+ INITIAL_BLOCKS_16 %%IN, %%OUT, %%KP, %%DATA_OFFSET, \
+ %%GHASH, %%CTR, %%CTR_CHECK, %%ADDBE_4x4, %%ADDBE_1234, \
+ %%T0, %%T1, %%T2, %%T3, %%T4, \
+ %%T5, %%T6, %%T7, %%T8, \
+ %%SHUF_MASK, %%ENC_DEC, aesout_offset, data_in_out_offset
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+
+%if (%%DEPTH_BLK > 16)
+%rep ((%%DEPTH_BLK - 16) / 16)
+ INITIAL_BLOCKS_16 %%IN, %%OUT, %%KP, %%DATA_OFFSET, \
+ no_ghash, %%CTR, %%CTR_CHECK, %%ADDBE_4x4, %%ADDBE_1234, \
+ %%T0, %%T1, %%T2, %%T3, %%T4, \
+ %%T5, %%T6, %%T7, %%T8, \
+ %%SHUF_MASK, %%ENC_DEC, aesout_offset, data_in_out_offset
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%endrep
+%endif
+
+ ;; ==== GHASH + AES follows
+
+ ;; first 16 blocks stitched
+ GHASH_16_ENCRYPT_16_PARALLEL %%KP, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHUF_MASK, \
+ %%T0, %%T1, %%T2, %%T3, \
+ %%T4, %%T5, %%T6, %%T7, \
+ %%T8, %%T9, %%T10, %%T11,\
+ %%T12, %%T13, %%T14, %%T15,\
+ %%T16, %%T17, %%T18, %%T19, \
+ %%T20, %%T21, %%T22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GL, %%GH, %%GM, \
+ first_time, %%ENC_DEC, data_in_out_offset, no_ghash_in
+
+%if ((%%NBLOCKS - %%DEPTH_BLK) > 16)
+%rep ((%%NBLOCKS - %%DEPTH_BLK - 16) / 16)
+%assign ghashin_offset (ghashin_offset + (16 * 16))
+%assign hkey_offset (hkey_offset + (16 * 16))
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+
+ ;; mid 16 blocks - stitched
+ GHASH_16_ENCRYPT_16_PARALLEL %%KP, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHUF_MASK, \
+ %%T0, %%T1, %%T2, %%T3, \
+ %%T4, %%T5, %%T6, %%T7, \
+ %%T8, %%T9, %%T10, %%T11,\
+ %%T12, %%T13, %%T14, %%T15,\
+ %%T16, %%T17, %%T18, %%T19, \
+ %%T20, %%T21, %%T22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GL, %%GH, %%GM, \
+ no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+%endrep
+%endif
+ add %%DATA_OFFSET, (%%NBLOCKS * 16)
+
+%endmacro ;INITIAL_BLOCKS_Nx16
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; GHASH the last %%DEPTH_BLK blocks of cipher text (tail of the by Nx16 code)
+%macro GHASH_LAST_Nx16 23
+%define %%KP %1 ; [in] pointer to expanded keys
+%define %%GHASH %2 ; [out] ghash output
+%define %%T1 %3 ; [clobbered] temporary ZMM
+%define %%T2 %4 ; [clobbered] temporary ZMM
+%define %%T3 %5 ; [clobbered] temporary ZMM
+%define %%T4 %6 ; [clobbered] temporary ZMM
+%define %%T5 %7 ; [clobbered] temporary ZMM
+%define %%T6 %8 ; [clobbered] temporary ZMM
+%define %%T7 %9 ; [clobbered] temporary ZMM
+%define %%T8 %10 ; [clobbered] temporary ZMM
+%define %%T9 %11 ; [clobbered] temporary ZMM
+%define %%T10 %12 ; [clobbered] temporary ZMM
+%define %%T11 %13 ; [clobbered] temporary ZMM
+%define %%T12 %14 ; [clobbered] temporary ZMM
+%define %%T13 %15 ; [clobbered] temporary ZMM
+%define %%T14 %16 ; [clobbered] temporary ZMM
+%define %%T15 %17 ; [clobbered] temporary ZMM
+%define %%T16 %18 ; [clobbered] temporary ZMM
+%define %%GH            %19     ; [in/clobbered] ghash sum (high)
+%define %%GL            %20     ; [in/clobbered] ghash sum (low)
+%define %%GM            %21     ; [in/clobbered] ghash sum (medium)
+%define %%LOOP_BLK      %22     ; [in] number of blocks handled by the loop
+%define %%DEPTH_BLK     %23     ; [in] pipeline depth in blocks (ghash vs aes)
+
+%define %%T0H %%T1
+%define %%T0L %%T2
+%define %%T0M1 %%T3
+%define %%T0M2 %%T4
+
+%define %%T1H %%T5
+%define %%T1L %%T6
+%define %%T1M1 %%T7
+%define %%T1M2 %%T8
+
+%define %%T2H %%T9
+%define %%T2L %%T10
+%define %%T2M1 %%T11
+%define %%T2M2 %%T12
+
+%define %%BLK1 %%T13
+%define %%BLK2 %%T14
+
+%define %%HK1 %%T15
+%define %%HK2 %%T16
+
+%assign hashk HashKey_ %+ %%DEPTH_BLK
+%assign cipher_blk (STACK_LOCAL_OFFSET + ((%%LOOP_BLK - %%DEPTH_BLK) * 16))
+
+ ;; load cipher blocks and ghash keys
+ vmovdqa64 %%BLK1, [rsp + cipher_blk]
+ vmovdqa64 %%BLK2, [rsp + cipher_blk + 64]
+ vmovdqu64 %%HK1, [%%KP + hashk]
+ vmovdqu64 %%HK2, [%%KP + hashk + 64]
+ ;; ghash blocks 0-3
+ vpclmulqdq %%T0H, %%BLK1, %%HK1, 0x11 ; %%TH = a1*b1
+ vpclmulqdq %%T0L, %%BLK1, %%HK1, 0x00 ; %%TL = a0*b0
+ vpclmulqdq %%T0M1, %%BLK1, %%HK1, 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq %%T0M2, %%BLK1, %%HK1, 0x10 ; %%TM2 = a0*b1
+ ;; ghash blocks 4-7
+ vpclmulqdq %%T1H, %%BLK2, %%HK2, 0x11 ; %%TTH = a1*b1
+ vpclmulqdq %%T1L, %%BLK2, %%HK2, 0x00 ; %%TTL = a0*b0
+ vpclmulqdq %%T1M1, %%BLK2, %%HK2, 0x01 ; %%TTM1 = a1*b0
+ vpclmulqdq %%T1M2, %%BLK2, %%HK2, 0x10 ; %%TTM2 = a0*b1
+ vpternlogq %%T0H, %%T1H, %%GH, 0x96 ; T0H = T0H + T1H + GH
+ vpternlogq %%T0L, %%T1L, %%GL, 0x96 ; T0L = T0L + T1L + GL
+ vpternlogq %%T0M1, %%T1M1, %%GM, 0x96 ; T0M1 = T0M1 + T1M1 + GM
+ vpxorq %%T0M2, %%T0M2, %%T1M2 ; T0M2 = T0M2 + T1M2
+
+%rep ((%%DEPTH_BLK - 8) / 8)
+%assign hashk (hashk + 128)
+%assign cipher_blk (cipher_blk + 128)
+
+ ;; remaining blocks
+ ;; load next 8 cipher blocks and corresponding ghash keys
+ vmovdqa64 %%BLK1, [rsp + cipher_blk]
+ vmovdqa64 %%BLK2, [rsp + cipher_blk + 64]
+ vmovdqu64 %%HK1, [%%KP + hashk]
+ vmovdqu64 %%HK2, [%%KP + hashk + 64]
+ ;; ghash blocks 0-3
+ vpclmulqdq %%T1H, %%BLK1, %%HK1, 0x11 ; %%TH = a1*b1
+ vpclmulqdq %%T1L, %%BLK1, %%HK1, 0x00 ; %%TL = a0*b0
+ vpclmulqdq %%T1M1, %%BLK1, %%HK1, 0x01 ; %%TM1 = a1*b0
+ vpclmulqdq %%T1M2, %%BLK1, %%HK1, 0x10 ; %%TM2 = a0*b1
+ ;; ghash blocks 4-7
+ vpclmulqdq %%T2H, %%BLK2, %%HK2, 0x11 ; %%TTH = a1*b1
+ vpclmulqdq %%T2L, %%BLK2, %%HK2, 0x00 ; %%TTL = a0*b0
+ vpclmulqdq %%T2M1, %%BLK2, %%HK2, 0x01 ; %%TTM1 = a1*b0
+ vpclmulqdq %%T2M2, %%BLK2, %%HK2, 0x10 ; %%TTM2 = a0*b1
+ ;; update sums
+ vpternlogq %%T0H, %%T1H, %%T2H, 0x96 ; TH = T0H + T1H + T2H
+ vpternlogq %%T0L, %%T1L, %%T2L, 0x96 ; TL = T0L + T1L + T2L
+        vpternlogq      %%T0M1, %%T1M1, %%T2M1, 0x96    ; TM1 = T0M1 + T1M1 + T2M1
+        vpternlogq      %%T0M2, %%T1M2, %%T2M2, 0x96    ; TM2 = T0M2 + T1M2 + T2M2
+%endrep
+
+ ;; integrate TM into TH and TL
+ vpxorq %%T0M1, %%T0M1, %%T0M2
+ vpsrldq %%T1M1, %%T0M1, 8
+ vpslldq %%T1M2, %%T0M1, 8
+ vpxorq %%T0H, %%T0H, %%T1M1
+ vpxorq %%T0L, %%T0L, %%T1M2
+
+ ;; add TH and TL 128-bit words horizontally
+ VHPXORI4x128 %%T0H, %%T2M1
+ VHPXORI4x128 %%T0L, %%T2M2
+
+ ;; reduction
+ vmovdqa64 %%HK1, [rel POLY2]
+ VCLMUL_REDUCE %%GHASH, %%HK1, %%T0H, %%T0L, %%T0M1, %%T0M2
+%endmacro
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; Encrypt & ghash multiples of 16 blocks
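+;;; Steady-state loop body: each pass ciphers %%NUM_BLOCKS new blocks while
+;;; GHASHing the %%NUM_BLOCKS ciphertext blocks stored on the stack roughly
+;;; %%DEPTH_BLK blocks earlier in the pipeline, performs exactly one
+;;; reduction per pass and feeds the reduced hash back into cipher block 0
+;;; of the new pass (via %%GHASH).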
+
+%macro GHASH_ENCRYPT_Nx16_PARALLEL 39
+%define %%IN %1 ; [in] input buffer
+%define %%OUT %2 ; [in] output buffer
+%define %%GDATA_KEY %3 ; [in] pointer to expanded keys
+%define %%DATA_OFFSET %4 ; [in/out] data offset
+%define %%CTR_BE %5 ; [in/out] ZMM last counter block
+%define %%SHFMSK %6 ; [in] ZMM with byte swap mask for pshufb
+%define %%ZT0           %7      ; [clobbered] temporary ZMM register
+%define %%ZT1           %8      ; [clobbered] temporary ZMM register
+%define %%ZT2           %9      ; [clobbered] temporary ZMM register
+%define %%ZT3           %10     ; [clobbered] temporary ZMM register
+%define %%ZT4           %11     ; [clobbered] temporary ZMM register
+%define %%ZT5           %12     ; [clobbered] temporary ZMM register
+%define %%ZT6           %13     ; [clobbered] temporary ZMM register
+%define %%ZT7           %14     ; [clobbered] temporary ZMM register
+%define %%ZT8           %15     ; [clobbered] temporary ZMM register
+%define %%ZT9           %16     ; [clobbered] temporary ZMM register
+%define %%ZT10          %17     ; [clobbered] temporary ZMM register
+%define %%ZT11          %18     ; [clobbered] temporary ZMM register
+%define %%ZT12          %19     ; [clobbered] temporary ZMM register
+%define %%ZT13          %20     ; [clobbered] temporary ZMM register
+%define %%ZT14          %21     ; [clobbered] temporary ZMM register
+%define %%ZT15          %22     ; [clobbered] temporary ZMM register
+%define %%ZT16          %23     ; [clobbered] temporary ZMM register
+%define %%ZT17          %24     ; [clobbered] temporary ZMM register
+%define %%ZT18          %25     ; [clobbered] temporary ZMM register
+%define %%ZT19          %26     ; [clobbered] temporary ZMM register
+%define %%ZT20          %27     ; [clobbered] temporary ZMM register
+%define %%ZT21          %28     ; [clobbered] temporary ZMM register
+%define %%ZT22          %29     ; [clobbered] temporary ZMM register
+%define %%GTH %30 ; [in/out] ZMM GHASH sum (high)
+%define %%GTL %31 ; [in/out] ZMM GHASH sum (low)
+%define %%GTM %32 ; [in/out] ZMM GHASH sum (medium)
+%define %%ADDBE_4x4 %33 ; [in] ZMM 4x128bits with value 4 (big endian)
+%define %%ADDBE_1234 %34 ; [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
+%define %%GHASH         %35     ; [clobbered] ZMM with intermediate GHASH value
+%define %%ENC_DEC %36 ; [in] ENC (encrypt) or DEC (decrypt) selector
+%define %%NUM_BLOCKS %37 ; [in] number of blocks to process in the loop
+%define %%DEPTH_BLK %38 ; [in] pipeline depth in blocks
+%define %%CTR_CHECK %39 ; [in/out] counter to check byte overflow
+
+%assign aesout_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign ghashin_offset (STACK_LOCAL_OFFSET + ((%%NUM_BLOCKS - %%DEPTH_BLK) * 16))
+%assign hkey_offset HashKey_ %+ %%DEPTH_BLK
+%assign data_in_out_offset 0
+
+ ;; mid 16 blocks
+%if (%%DEPTH_BLK > 16)
+%rep ((%%DEPTH_BLK - 16) / 16)
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign ghashin_offset (ghashin_offset + (16 * 16))
+%assign hkey_offset (hkey_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%endrep
+%endif
+
+ ;; 16 blocks with reduction
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ HashKey_16, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ final_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%assign ghashin_offset (STACK_LOCAL_OFFSET + (0 * 16))
+%assign hkey_offset HashKey_ %+ %%NUM_BLOCKS
+
+ ;; === xor cipher block 0 with GHASH (ZT4)
+ vmovdqa64 %%GHASH, %%ZT4
+
+ ;; start the pipeline again
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ first_time, %%ENC_DEC, data_in_out_offset, %%GHASH
+
+%if ((%%NUM_BLOCKS - %%DEPTH_BLK) > 16)
+%rep ((%%NUM_BLOCKS - %%DEPTH_BLK - 16 ) / 16)
+
+%assign aesout_offset (aesout_offset + (16 * 16))
+%assign data_in_out_offset (data_in_out_offset + (16 * 16))
+%assign ghashin_offset (ghashin_offset + (16 * 16))
+%assign hkey_offset (hkey_offset + (16 * 16))
+
+ GHASH_16_ENCRYPT_16_PARALLEL %%GDATA_KEY, %%OUT, %%IN, %%DATA_OFFSET, \
+ %%CTR_BE, %%CTR_CHECK, \
+ hkey_offset, aesout_offset, ghashin_offset, %%SHFMSK, \
+ %%ZT0, %%ZT1, %%ZT2, %%ZT3, \
+ %%ZT4, %%ZT5, %%ZT6, %%ZT7, \
+ %%ZT8, %%ZT9, %%ZT10, %%ZT11,\
+ %%ZT12, %%ZT13, %%ZT14, %%ZT15,\
+ %%ZT16, %%ZT17, %%ZT18, %%ZT19, \
+ %%ZT20, %%ZT21, %%ZT22, \
+ %%ADDBE_4x4, %%ADDBE_1234, \
+ %%GTL, %%GTH, %%GTM, \
+ no_reduction, %%ENC_DEC, data_in_out_offset, no_ghash_in
+%endrep
+%endif
+
+ add %%DATA_OFFSET, (%%NUM_BLOCKS * 16)
+
+%endmacro ;GHASH_ENCRYPT_Nx16_PARALLEL
+;;; ===========================================================================
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE finishes encryption/decryption of the last partial block after GCM_UPDATE finishes.
+; Input: gcm_key_data * (GDATA_KEY), gcm_context_data * (GDATA_CTX) and whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
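+; For reference, the tag follows the GCM definition: S = GHASH over AAD,
+; ciphertext and the 128-bit len(A)||len(C) block, T = MSB_t(E(K, J0) xor S);
+; E(K, J0) (= E(K, Y0) below) is issued first so it overlaps the final GHASH.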
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 6
+%define %%GDATA_KEY %1
+%define %%GDATA_CTX %2
+%define %%AUTH_TAG %3
+%define %%AUTH_TAG_LEN %4
+%define %%ENC_DEC %5
+%define %%INSTANCE_TYPE %6
+%define %%PLAIN_CYPH_LEN rax
+
+ vmovdqu xmm13, [%%GDATA_KEY + HashKey]
+ ;; Start AES as early as possible
+ vmovdqu xmm9, [%%GDATA_CTX + OrigIV] ; xmm9 = Y0
+ ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, xmm9 ; E(K, Y0)
+
+%ifidn %%INSTANCE_TYPE, multi_call
+        ;; When the GCM function is called as a single function call rather
+        ;; than through the individual parts (init, update, finalize), the
+        ;; AadHash value is still held in xmm14, so the reload below is
+        ;; skipped and a write-to-read dependency on AadHash is avoided.
+ vmovdqu xmm14, [%%GDATA_CTX + AadHash]
+
+	;; GHASH the final partial block. In a single call the partial block
+	;; was already handled in the main GCM_ENC_DEC macro.
+ mov r12, [%%GDATA_CTX + PBlockLen]
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA_CTX + AadHash], xmm14
+
+%%_partial_done:
+
+%endif
+
+ mov r12, [%%GDATA_CTX + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA_CTX + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovd xmm15, r12d ; len(A) in xmm15
+
+        shl     %%PLAIN_CYPH_LEN, 3             ; len(C) in bits (*8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6
+        vpshufb  xmm14, [rel SHUF_MASK]         ; perform a 16-byte swap
+
+ vpxor xmm9, xmm9, xmm14
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+ cmp r11, 8
+ je %%_T_8
+
+ simd_store_avx_15 r10, xmm9, r11, r12, rax
+ jmp %%_return_T_done
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+
+%ifdef SAFE_DATA
+ ;; Clear sensitive data from context structure
+ vpxor xmm0, xmm0
+ vmovdqu [%%GDATA_CTX + AadHash], xmm0
+ vmovdqu [%%GDATA_CTX + PBlockEncKey], xmm0
+%endif
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_precomp_128_vaes_avx512 /
+; aes_gcm_precomp_192_vaes_avx512 /
+; aes_gcm_precomp_256_vaes_avx512
+; (struct gcm_key_data *key_data)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(precomp,_)
+FN_NAME(precomp,_):
+ endbranch
+;; Parameter is passed through register
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_precomp
+%endif
+
+ FUNC_SAVE
+
+ vpxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [rel SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
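+        ;; Illustrative outline of the steps below: multiply H by x in GF(2^128)
+        ;; (a 1-bit left shift across the full 128-bit value), then conditionally
+        ;; XOR in POLY when the shifted-out top bit was set (the TWOONE compare
+        ;; builds that mask), leaving HashKey<<1 mod poly in xmm6.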
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, xmm6, 1
+ vpsrlq xmm2, xmm2, 63
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8
+ vpsrldq xmm1, xmm1, 8
+ vpor xmm6, xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [rel TWOONE]
+ vpand xmm2, xmm2, [rel POLY]
+ vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+ FUNC_RESTORE
+exit_precomp:
+
+ ret
+%endif ; _nt
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_init_128_vaes_avx512 / aes_gcm_init_192_vaes_avx512 / aes_gcm_init_256_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
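+;; Illustrative multi_call usage from C (not part of this file); assumes
+;; key_data has already been populated with the expanded AES key and GHASH
+;; keys (e.g. via the library's key-expansion/precompute helpers):
+;;   struct gcm_key_data kd;          /* pre-expanded */
+;;   struct gcm_context_data ctx;
+;;   aes_gcm_init_256_vaes_avx512(&kd, &ctx, iv, aad, aad_len);
+;;   aes_gcm_enc_256_update_vaes_avx512(&kd, &ctx, out, in, len);
+;;   aes_gcm_enc_256_finalize_vaes_avx512(&kd, &ctx, tag, 16);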
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(init,_)
+FN_NAME(init,_):
+ endbranch
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_init
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_init
+
+ ;; Check IV != NULL
+ cmp arg3, 0
+ jz exit_init
+
+ ;; Check if aad_len == 0
+ cmp arg5, 0
+ jz skip_aad_check_init
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg4, 0
+ jz exit_init
+
+skip_aad_check_init:
+%endif
+ GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12, k1, xmm14, xmm2, \
+ zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10
+
+exit_init:
+
+ FUNC_RESTORE
+ ret
+%endif ; _nt
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_update_vaes_avx512 / aes_gcm_enc_192_update_vaes_avx512 /
+; aes_gcm_enc_256_update_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(enc,_update_)
+FN_NAME(enc,_update_):
+ endbranch
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_enc
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_update_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_enc
+
+skip_in_out_check_update_enc:
+%endif
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call
+
+exit_update_enc:
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_update_vaes_avx512 / aes_gcm_dec_192_update_vaes_avx512 /
+; aes_gcm_dec_256_update_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_update_)
+FN_NAME(dec,_update_):
+ endbranch
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_update_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_update_dec
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_update_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_update_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_update_dec
+
+skip_in_out_check_update_dec:
+%endif
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call
+
+exit_update_dec:
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_finalize_vaes_avx512 / aes_gcm_enc_192_finalize_vaes_avx512 /
+; aes_gcm_enc_256_finalize_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(enc,_finalize_)
+FN_NAME(enc,_finalize_):
+ endbranch
+
+;; All parameters are passed through registers
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc_fin
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_enc_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_enc_fin
+
+ cmp arg4, 16
+ ja exit_enc_fin
+%endif
+
+ FUNC_SAVE
+ GCM_COMPLETE arg1, arg2, arg3, arg4, ENC, multi_call
+
+ FUNC_RESTORE
+
+exit_enc_fin:
+ ret
+%endif ; _nt
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_finalize_vaes_avx512 / aes_gcm_dec_192_finalize_vaes_avx512
+; aes_gcm_dec_256_finalize_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifnidn FUNCT_EXTENSION, _nt
+global FN_NAME(dec,_finalize_)
+FN_NAME(dec,_finalize_):
+ endbranch
+
+;; All parameters are passed through registers
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec_fin
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag != NULL
+ cmp arg3, 0
+ jz exit_dec_fin
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg4, 0
+ jz exit_dec_fin
+
+ cmp arg4, 16
+ ja exit_dec_fin
+%endif
+
+ FUNC_SAVE
+ GCM_COMPLETE arg1, arg2, arg3, arg4, DEC, multi_call
+
+ FUNC_RESTORE
+
+exit_dec_fin:
+ ret
+%endif ; _nt
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_enc_128_vaes_avx512 / aes_gcm_enc_192_vaes_avx512 / aes_gcm_enc_256_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
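+;; Illustrative single-shot call from C (not part of this file; key_data
+;; assumed pre-expanded as above):
+;;   aes_gcm_enc_256_vaes_avx512(&kd, &ctx, ct, pt, pt_len,
+;;                               iv, aad, aad_len, tag, 16);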
+global FN_NAME(enc,_)
+FN_NAME(enc,_):
+ endbranch
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_enc
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_enc
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_enc
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_enc
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_enc
+
+ cmp arg10, 16
+ ja exit_enc
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_enc
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_enc
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_enc
+
+skip_in_out_check_enc:
+ ;; Check if aad_len == 0
+ cmp arg8, 0
+ jz skip_aad_check_enc
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_enc
+
+skip_aad_check_enc:
+%endif
+ GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \
+ zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call
+ GCM_COMPLETE arg1, arg2, arg9, arg10, ENC, single_call
+
+exit_enc:
+ FUNC_RESTORE
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aes_gcm_dec_128_vaes_avx512 / aes_gcm_dec_192_vaes_avx512 / aes_gcm_dec_256_vaes_avx512
+; (const struct gcm_key_data *key_data,
+; struct gcm_context_data *context_data,
+; u8 *out,
+; const u8 *in,
+; u64 plaintext_len,
+; u8 *iv,
+; const u8 *aad,
+; u64 aad_len,
+; u8 *auth_tag,
+; u64 auth_tag_len);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global FN_NAME(dec,_)
+FN_NAME(dec,_):
+ endbranch
+
+ FUNC_SAVE
+
+%ifdef SAFE_PARAM
+ ;; Check key_data != NULL
+ cmp arg1, 0
+ jz exit_dec
+
+ ;; Check context_data != NULL
+ cmp arg2, 0
+ jz exit_dec
+
+ ;; Check IV != NULL
+ cmp arg6, 0
+ jz exit_dec
+
+ ;; Check auth_tag != NULL
+ cmp arg9, 0
+ jz exit_dec
+
+ ;; Check auth_tag_len == 0 or > 16
+ cmp arg10, 0
+ jz exit_dec
+
+ cmp arg10, 16
+ ja exit_dec
+
+ ;; Check if plaintext_len == 0
+ cmp arg5, 0
+ jz skip_in_out_check_dec
+
+ ;; Check out != NULL (plaintext_len != 0)
+ cmp arg3, 0
+ jz exit_dec
+
+ ;; Check in != NULL (plaintext_len != 0)
+ cmp arg4, 0
+ jz exit_dec
+
+skip_in_out_check_dec:
+ ;; Check if aad_len == 0
+ cmp arg8, 0
+ jz skip_aad_check_dec
+
+ ;; Check aad != NULL (aad_len != 0)
+ cmp arg7, 0
+ jz exit_dec
+
+skip_aad_check_dec:
+%endif
+ GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \
+ zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call
+ GCM_COMPLETE arg1, arg2, arg9, arg10, DEC, single_call
+
+exit_dec:
+ FUNC_RESTORE
+ ret
+
+%else ; Assembler doesn't understand these opcodes. Add an empty symbol for Windows.
+%ifidn __OUTPUT_FORMAT__, win64
+global no_ %+ FN_NAME(avx512,_)
+no_ %+ FN_NAME(avx512,_) %+ :
+%endif
+%endif ; (AS_FEATURE_LEVEL) >= 10
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h
new file mode 100644
index 000000000..8287198ae
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h
@@ -0,0 +1,476 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef AES_GCM_VECTORS_H_
+#define AES_GCM_VECTORS_H_
+
+#include <stdint.h>
+
+typedef enum gcm_key_size { BITS_128 = 16, BITS_256 = 32 } gcm_key_size;
+#define KBITS(K) (sizeof(K))
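+// Note: despite the name, KBITS() yields the key size in *bytes* (sizeof);
+// e.g. KBITS(K1) == 16, which matches BITS_128 above and is what the
+// vector() macro stores in gcm_vector.Klen.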
+
+// struct to hold pointers to the key, plaintext and ciphertext vectors
+typedef struct gcm_vector {
+ uint8_t* K; // AES Key
+	gcm_key_size Klen;	// length of key in bytes (see gcm_key_size)
+ uint8_t* IV; // initial value used by GCM
+ uint64_t IVlen; // length of IV in bytes
+ uint8_t* A; // additional authenticated data
+ uint64_t Alen; // length of AAD in bytes
+ uint8_t* P; // Plain text
+	uint64_t Plen;	  // length of plaintext in bytes
+ //outputs of encryption
+ uint8_t* C; // same length as PT
+ uint8_t* T; // Authentication tag
+	uint8_t Tlen;	  // tag length in bytes (tags of 0 to 128 bits are allowed)
+} gcm_vector;
+
+///////
+// 60-Byte Packet Encryption Using GCM-AES-128
+// http://www.ieee802.org/1/files/public/docs2011/bn-randall-test-vectors-0511-v1.pdf
+// K: AD7A2BD03EAC835A6F620FDCB506B345
+// IV: 12153524C0895E81B2C28465
+// AAD: D609B1F056637A0D46DF998D88E52E00
+// B2C2846512153524C0895E81
+// P: 08000F101112131415161718191A1B1C
+// 1D1E1F202122232425262728292A2B2C
+// 2D2E2F303132333435363738393A0002
+// C: 701AFA1CC039C0D765128A665DAB6924
+// 3899BF7318CCDC81C9931DA17FBE8EDD
+// 7D17CB8B4C26FC81E3284F2B7FBA713D
+// AT: 4F8D55E7D3F06FD5A13C0C29B9D5B880
+// H: 73A23D80121DE2D5A850253FCF43120E
+///////
+static uint8_t K1[] = {0xAD, 0x7A, 0x2B, 0xD0, 0x3E, 0xAC, 0x83, 0x5A, 0x6F, 0x62, 0x0F, 0xDC, 0xB5, 0x06, 0xB3, 0x45};
+static uint8_t P1[] = {
+ 0x08, 0x00, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C
+ , 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C
+ , 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x00, 0x02
+};
+static uint8_t IV1[] = {0x12, 0x15, 0x35, 0x24, 0xC0, 0x89, 0x5E, 0x81, 0xB2, 0xC2, 0x84, 0x65};
+static uint8_t A1[] = {
+ 0xD6, 0x09, 0xB1, 0xF0, 0x56, 0x63, 0x7A, 0x0D, 0x46, 0xDF, 0x99, 0x8D, 0x88, 0xE5, 0x2E, 0x00
+ , 0xB2, 0xC2, 0x84, 0x65, 0x12, 0x15, 0x35, 0x24, 0xC0, 0x89, 0x5E, 0x81
+};
+#define A1_len sizeof(A1)
+static uint8_t C1[] = {
+ 0x70, 0x1A, 0xFA, 0x1C, 0xC0, 0x39, 0xC0, 0xD7, 0x65, 0x12, 0x8A, 0x66, 0x5D, 0xAB, 0x69, 0x24
+ , 0x38, 0x99, 0xBF, 0x73, 0x18, 0xCC, 0xDC, 0x81, 0xC9, 0x93, 0x1D, 0xA1, 0x7F, 0xBE, 0x8E, 0xDD
+ , 0x7D, 0x17, 0xCB, 0x8B, 0x4C, 0x26, 0xFC, 0x81, 0xE3, 0x28, 0x4F, 0x2B, 0x7F, 0xBA, 0x71, 0x3D
+};
+static uint8_t T1[] = {
+ 0x4F, 0x8D, 0x55, 0xE7, 0xD3, 0xF0, 0x6F, 0xD5, 0xA1, 0x3C, 0x0C, 0x29, 0xB9, 0xD5, 0xB8, 0x80
+};
+
+
+///////
+// 54-Byte Packet Encryption Using GCM-AES-128
+// http://www.ieee802.org/1/files/public/docs2011/bn-randall-test-vectors-0511-v1.pdf
+// K: 071B113B0CA743FECCCF3D051F737382
+// IV: F0761E8DCD3D000176D457ED
+// AAD: E20106D7CD0DF0761E8DCD3D88E54C2A
+// 76D457ED
+// P: 08000F101112131415161718191A1B1C
+// 1D1E1F202122232425262728292A2B2C
+// 2D2E2F30313233340004
+// C: 13B4C72B389DC5018E72A171DD85A5D3
+// 752274D3A019FBCAED09A425CD9B2E1C
+// 9B72EEE7C9DE7D52B3F3
+// AT: D6A5284F4A6D3FE22A5D6C2B960494C3
+// H: E4E01725D724C1215C7309AD34539257
+///////
+static uint8_t K2[] = {0x07, 0x1B, 0x11, 0x3B, 0x0C, 0xA7, 0x43, 0xFE, 0xCC, 0xCF, 0x3D, 0x05, 0x1F, 0x73, 0x73, 0x82};
+static uint8_t P2[] = {
+ 0x08, 0x00, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C
+ , 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C
+ , 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x00, 0x04
+};
+static uint8_t IV2[] = {0xF0, 0x76, 0x1E, 0x8D, 0xCD, 0x3D, 0x00, 0x01, 0x76, 0xD4, 0x57, 0xED};
+//static uint8_t IV1p[] = {0, 0, 0, 1};
+static uint8_t A2[] = {
+ 0xE2, 0x01, 0x06, 0xD7, 0xCD, 0x0D, 0xF0, 0x76, 0x1E, 0x8D, 0xCD, 0x3D, 0x88, 0xE5, 0x4C, 0x2A
+ , 0x76, 0xD4, 0x57, 0xED
+};
+#define A2_len sizeof(A2)
+static uint8_t C2[] = {
+ 0x13, 0xB4, 0xC7, 0x2B, 0x38, 0x9D, 0xC5, 0x01, 0x8E, 0x72, 0xA1, 0x71, 0xDD, 0x85, 0xA5, 0xD3
+ , 0x75, 0x22, 0x74, 0xD3, 0xA0, 0x19, 0xFB, 0xCA, 0xED, 0x09, 0xA4, 0x25, 0xCD, 0x9B, 0x2E, 0x1C
+ , 0x9B, 0x72, 0xEE, 0xE7, 0xC9, 0xDE, 0x7D, 0x52, 0xB3, 0xF3
+};
+static uint8_t T2[] = {
+ 0xD6, 0xA5, 0x28, 0x4F, 0x4A, 0x6D, 0x3F, 0xE2, 0x2A, 0x5D, 0x6C, 0x2B, 0x96, 0x04, 0x94, 0xC3
+};
+
+
+///////
+// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp
+// [Keylen = 128]
+// [IVlen = 96]
+// [PTlen = 128]
+// [AADlen = 128]
+// [Taglen = 128]
+// Count = 0
+// K: c939cc13397c1d37de6ae0e1cb7c423c
+// IV: b3d8cc017cbb89b39e0f67e2
+// P: c3b3c41f113a31b73d9a5cd432103069
+// AAD: 24825602bd12a984e0092d3e448eda5f
+// C: 93fe7d9e9bfd10348a5606e5cafa7354
+// AT: 0032a1dc85f1c9786925a2e71d8272dd
+///////
+static uint8_t K3[] = {0xc9, 0x39, 0xcc, 0x13, 0x39, 0x7c, 0x1d, 0x37, 0xde, 0x6a, 0xe0, 0xe1, 0xcb, 0x7c, 0x42, 0x3c};
+static uint8_t IV3[] = {0xb3, 0xd8, 0xcc, 0x01, 0x7c, 0xbb, 0x89, 0xb3, 0x9e, 0x0f, 0x67, 0xe2};
+static uint8_t P3[] = {0xc3, 0xb3, 0xc4, 0x1f, 0x11, 0x3a, 0x31, 0xb7, 0x3d, 0x9a, 0x5c, 0xd4, 0x32, 0x10, 0x30, 0x69};
+static uint8_t A3[] = {0x24, 0x82, 0x56, 0x02, 0xbd, 0x12, 0xa9, 0x84, 0xe0, 0x09, 0x2d, 0x3e, 0x44, 0x8e, 0xda, 0x5f};
+#define A3_len sizeof(A3)
+static uint8_t C3[] = {0x93, 0xfe, 0x7d, 0x9e, 0x9b, 0xfd, 0x10, 0x34, 0x8a, 0x56, 0x06, 0xe5, 0xca, 0xfa, 0x73, 0x54};
+static uint8_t T3[] = {0x00, 0x32, 0xa1, 0xdc, 0x85, 0xf1, 0xc9, 0x78, 0x69, 0x25, 0xa2, 0xe7, 0x1d, 0x82, 0x72, 0xdd};
+
+///////
+// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp
+// [Keylen = 128]
+// [IVlen = 96]
+// [PTlen = 256]
+// [AADlen = 128]
+// [Taglen = 128]
+// Count = 0
+// K = 298efa1ccf29cf62ae6824bfc19557fc
+// IV = 6f58a93fe1d207fae4ed2f6d
+// P = cc38bccd6bc536ad919b1395f5d63801f99f8068d65ca5ac63872daf16b93901
+// AAD = 021fafd238463973ffe80256e5b1c6b1
+// C = dfce4e9cd291103d7fe4e63351d9e79d3dfd391e3267104658212da96521b7db
+// T = 542465ef599316f73a7a560509a2d9f2
+///////
+static uint8_t K4[] = {0x29, 0x8e, 0xfa, 0x1c, 0xcf, 0x29, 0xcf, 0x62, 0xae, 0x68, 0x24, 0xbf, 0xc1, 0x95, 0x57, 0xfc};
+static uint8_t IV4[] = {0x6f, 0x58, 0xa9, 0x3f, 0xe1, 0xd2, 0x07, 0xfa, 0xe4, 0xed, 0x2f, 0x6d};
+static uint8_t P4[] = {0xcc, 0x38, 0xbc, 0xcd, 0x6b, 0xc5, 0x36, 0xad, 0x91, 0x9b, 0x13, 0x95, 0xf5, 0xd6, 0x38, 0x01, 0xf9, 0x9f, 0x80, 0x68, 0xd6, 0x5c, 0xa5, 0xac, 0x63, 0x87, 0x2d, 0xaf, 0x16, 0xb9, 0x39, 0x01};
+static uint8_t A4[] = {0x02, 0x1f, 0xaf, 0xd2, 0x38, 0x46, 0x39, 0x73, 0xff, 0xe8, 0x02, 0x56, 0xe5, 0xb1, 0xc6, 0xb1};
+#define A4_len sizeof(A4)
+static uint8_t C4[] = {0xdf, 0xce, 0x4e, 0x9c, 0xd2, 0x91, 0x10, 0x3d, 0x7f, 0xe4, 0xe6, 0x33, 0x51, 0xd9, 0xe7, 0x9d, 0x3d, 0xfd, 0x39, 0x1e, 0x32, 0x67, 0x10, 0x46, 0x58, 0x21, 0x2d, 0xa9, 0x65, 0x21, 0xb7, 0xdb};
+static uint8_t T4[] = {0x54, 0x24, 0x65, 0xef, 0x59, 0x93, 0x16, 0xf7, 0x3a, 0x7a, 0x56, 0x05, 0x09, 0xa2, 0xd9, 0xf2};
+
+///////
+// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp
+// [Keylen = 128]
+// [IVlen = 96]
+// [PTlen = 256]
+// [AADlen = 128]
+// [Taglen = 128]
+// Count = 0
+// K = 298efa1ccf29cf62ae6824bfc19557fc
+// IV = 6f58a93fe1d207fae4ed2f6d
+// P = cc38bccd6bc536ad919b1395f5d63801f99f8068d65ca5ac63872daf16b93901
+// AAD = 021fafd238463973ffe80256e5b1c6b1
+// C = dfce4e9cd291103d7fe4e63351d9e79d3dfd391e3267104658212da96521b7db
+// T = 542465ef599316f73a7a560509a2d9f2
+///////
+static uint8_t K5[] = {0x29, 0x8e, 0xfa, 0x1c, 0xcf, 0x29, 0xcf, 0x62, 0xae, 0x68, 0x24, 0xbf, 0xc1, 0x95, 0x57, 0xfc};
+static uint8_t IV5[] = {0x6f, 0x58, 0xa9, 0x3f, 0xe1, 0xd2, 0x07, 0xfa, 0xe4, 0xed, 0x2f, 0x6d};
+static uint8_t P5[] = {0xcc, 0x38, 0xbc, 0xcd, 0x6b, 0xc5, 0x36, 0xad, 0x91, 0x9b, 0x13, 0x95, 0xf5, 0xd6, 0x38, 0x01, 0xf9, 0x9f, 0x80, 0x68, 0xd6, 0x5c, 0xa5, 0xac, 0x63, 0x87, 0x2d, 0xaf, 0x16, 0xb9, 0x39, 0x01};
+static uint8_t A5[] = {0x02, 0x1f, 0xaf, 0xd2, 0x38, 0x46, 0x39, 0x73, 0xff, 0xe8, 0x02, 0x56, 0xe5, 0xb1, 0xc6, 0xb1};
+#define A5_len sizeof(A5)
+static uint8_t C5[] = {0xdf, 0xce, 0x4e, 0x9c, 0xd2, 0x91, 0x10, 0x3d, 0x7f, 0xe4, 0xe6, 0x33, 0x51, 0xd9, 0xe7, 0x9d, 0x3d, 0xfd, 0x39, 0x1e, 0x32, 0x67, 0x10, 0x46, 0x58, 0x21, 0x2d, 0xa9, 0x65, 0x21, 0xb7, 0xdb};
+static uint8_t T5[] = {0x54, 0x24, 0x65, 0xef, 0x59, 0x93, 0x16, 0xf7, 0x3a, 0x7a, 0x56, 0x05, 0x09, 0xa2, 0xd9, 0xf2};
+
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 2
+// K: 00000000000000000000000000000000
+// P: 00000000000000000000000000000000
+// IV: 000000000000000000000000
+// C: 0388dace60b6a392f328c2b971b2fe78
+// T: ab6e47d42cec13bdf53a67b21257bddf
+// H: 66e94bd4ef8a2c3b884cfa59ca342b2e
+///////
+static uint8_t K6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+static uint8_t P6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+static uint8_t IV6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+static uint8_t A6[] = {0};
+#define A6_len 0
+static uint8_t C6[] = {0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92, 0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78};
+static uint8_t T6[] = {0xab, 0x6e, 0x47, 0xd4, 0x2c, 0xec, 0x13, 0xbd, 0xf5, 0x3a, 0x67, 0xb2, 0x12, 0x57, 0xbd, 0xdf};
+
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 3
+// K: feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b391aafd255
+// IV: cafebabefacedbaddecaf888
+// H: b83b533708bf535d0aa6e52980d53b78
+// C: 42831ec2217774244b7221b784d0d49c
+// e3aa212f2c02a4e035c17e2329aca12e
+// 21d514b25466931c7d8f6a5aac84aa05
+// 1ba30b396a0aac973d58e091473f5985
+// T: 4d5c2af327cd64a62cf35abd2ba6fab4
+///////
+static uint8_t K7[] = {0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P7[] = {0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a
+ , 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72
+ , 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25
+ , 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55};
+static uint8_t IV7[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+static uint8_t A7[] = {0};
+#define A7_len 0
+static uint8_t C7[] = {0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c
+ , 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e
+ , 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05
+ , 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85};
+static uint8_t T7[] = {0x4d, 0x5c, 0x2a, 0xf3, 0x27, 0xcd, 0x64, 0xa6, 0x2c, 0xf3, 0x5a, 0xbd, 0x2b, 0xa6, 0xfa, 0xb4};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 4
+// K: feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: cafebabefacedbaddecaf888
+// H: b83b533708bf535d0aa6e52980d53b78
+// C: 42831ec2217774244b7221b784d0d49c
+// e3aa212f2c02a4e035c17e2329aca12e
+// 21d514b25466931c7d8f6a5aac84aa05
+// 1ba30b396a0aac973d58e091
+// T: 5bc94fbc3221a5db94fae95ae7121a47
+///////
+static uint8_t K8[] = {0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P8[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a
+ , 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72
+ , 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25
+ , 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39};
+static uint8_t A8[] = {0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef
+ , 0xab, 0xad, 0xda, 0xd2};
+#define A8_len sizeof(A8)
+static uint8_t IV8[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+static uint8_t C8[] = {0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c
+ , 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e
+ , 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05
+ , 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85};
+static uint8_t T8[] = {0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb, 0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a, 0x47};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 14
+// K: 00000000000000000000000000000000
+// 00000000000000000000000000000000
+// P: 00000000000000000000000000000000
+// A:
+// IV: 000000000000000000000000
+// H: dc95c078a2408989ad48a21492842087
+// C: cea7403d4d606b6e074ec5d3baf39d18
+// T: d0d1c8a799996bf0265b98b5d48ab919
+///////
+static uint8_t K9[] = {
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
+static uint8_t P9[] = {
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+};
+static uint8_t A9[] = {0};
+#define A9_len 0
+static uint8_t IV9[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
+static uint8_t C9[] = {
+ 0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e, 0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18
+};
+static uint8_t T9[] = {0xd0, 0xd1, 0xc8, 0xa7, 0x99, 0x99, 0x6b, 0xf0, 0x26, 0x5b, 0x98, 0xb5, 0xd4, 0x8a, 0xb9, 0x19};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 15
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b391aafd255
+// A:
+// IV: cafebabefacedbaddecaf888
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: 522dc1f099567d07f47f37a32a84427d
+// 643a8cdcbfe5c0c97598a2bd2555d1aa
+// 8cb08e48590dbb3da7b08b1056828838
+// c5f61e6393ba7a0abcc9f662898015ad
+// T: b094dac5d93471bdec1a502270e3cc6c
+///////
+static uint8_t K10[] = {
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P10[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
+};
+static uint8_t A10[] = {0};
+#define A10_len 0
+static uint8_t IV10[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+static uint8_t C10[] = {
+ 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+ 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+ 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+ 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, 0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad
+};
+static uint8_t T10[] = {
+ 0xb0, 0x94, 0xda, 0xc5, 0xd9, 0x34, 0x71, 0xbd, 0xec, 0x1a, 0x50, 0x22, 0x70, 0xe3, 0xcc, 0x6c};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 16
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: cafebabefacedbaddecaf888
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: 522dc1f099567d07f47f37a32a84427d
+// 643a8cdcbfe5c0c97598a2bd2555d1aa
+// 8cb08e48590dbb3da7b08b1056828838
+// c5f61e6393ba7a0abcc9f662
+// T: 76fc6ece0f4e1768cddf8853bb2d551b
+///////
+static uint8_t K11[] = {
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P11[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39
+};
+static uint8_t A11[] = {
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xab, 0xad, 0xda, 0xd2};
+#define A11_len sizeof(A11)
+static uint8_t IV11[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+static uint8_t C11[] = {
+ 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+ 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+ 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+ 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, 0xbc, 0xc9, 0xf6, 0x62
+};
+static uint8_t T11[] = {0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 0x68, 0xcd, 0xdf, 0x88, 0x53, 0xbb, 0x2d, 0x55, 0x1b};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 17 -- not supported (IV length less than 12 bytes)
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: cafebabefacedbad
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: c3762df1ca787d32ae47c13bf19844cb
+// af1ae14d0b976afac52ff7d79bba9de0
+// feb582d33934a4f0954cc2363bc73f78
+// 62ac430e64abe499f47c9b1f
+// T: 3a337dbf46a792c45e454913fe2ea8f2
+///////
+//static uint8_t K12[] = {
+// 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+// 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+//static uint8_t P12[] = {
+// 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+// 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+// 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+// 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39
+//};
+//static uint8_t A12[] = {
+// 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+// 0xab, 0xad, 0xda, 0xd2};
+//static uint8_t IV12[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad};
+//static uint8_t H12[] = {
+// 0xac, 0xbe, 0xf2, 0x05, 0x79, 0xb4, 0xb8, 0xeb, 0xce, 0x88, 0x9b, 0xac, 0x87, 0x32, 0xda, 0xd7};
+//static uint8_t C12[] = {
+// 0xc3, 0x76, 0x2d, 0xf1, 0xca, 0x78, 0x7d, 0x32, 0xae, 0x47, 0xc1, 0x3b, 0xf1, 0x98, 0x44, 0xcb,
+// 0xaf, 0x1a, 0xe1, 0x4d, 0x0b, 0x97, 0x6a, 0xfa, 0xc5, 0x2f, 0xf7, 0xd7, 0x9b, 0xba, 0x9d, 0xe0,
+// 0xfe, 0xb5, 0x82, 0xd3, 0x39, 0x34, 0xa4, 0xf0, 0x95, 0x4c, 0xc2, 0x36, 0x3b, 0xc7, 0x3f, 0x78,
+// 0x62, 0xac, 0x43, 0x0e, 0x64, 0xab, 0xe4, 0x99, 0xf4, 0x7c, 0x9b, 0x1f
+//};
+//static uint8_t T12[] = {
+// 0x3a, 0x33, 0x7d, 0xbf, 0x46, 0xa7, 0x92, 0xc4, 0x5e, 0x45, 0x49, 0x13, 0xfe, 0x2e, 0xa8, 0xf2};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 18 -- not supported (IV length greater than 12 bytes)
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: 9313225df88406e555909c5aff5269aa
+// 6a7a9538534f7da1e4c303d2a318a728
+// c3c0c95156809539fcf0e2429a6b5254
+// 16aedbf5a0de6a57a637b39b
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: 5a8def2f0c9e53f1f75d7853659e2a20
+// eeb2b22aafde6419a058ab4f6f746bf4
+// 0fc0c3b780f244452da3ebf1c5d82cde
+// a2418997200ef82e44ae7e3f
+// T: a44a8266ee1c8eb0c8b5d4cf5ae9f19a
+///////
+
+
+#define vector(N) {K##N, (KBITS(K##N)), IV##N, sizeof(IV##N), A##N, A##N##_len, P##N, sizeof(P##N), C##N, T##N, sizeof(T##N)}
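+// For reference, vector(1) expands to:
+//   {K1, (KBITS(K1)), IV1, sizeof(IV1), A1, A1_len, P1, sizeof(P1), C1, T1, sizeof(T1)}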
+
+gcm_vector const gcm_vectors[] = {
+ //field order {K, Klen, IV, IVlen, A, Alen, P, Plen, C, T, Tlen};
+ // original vector does not have a valid sub hash key
+ vector(1),
+ vector(2),
+ vector(3),
+ vector(4),
+ vector(5),
+ vector(6),
+ vector(7),
+ vector(8),
+ vector(9),
+ vector(10),
+ vector(11),
+	/* vector(12), -- IVs of less than 12 bytes are not supported */
+};
+
+#endif /* AES_GCM_VECTORS_H_ */
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm
new file mode 100644
index 000000000..ddae6a4e7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm
@@ -0,0 +1,328 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Routine to do AES key expansion
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+%macro key_expansion_128_sse 0
+	;; Assumes xmm3 contains all zeros at this point.
+ pshufd xmm2, xmm2, 11111111b
+ shufps xmm3, xmm1, 00010000b
+ pxor xmm1, xmm3
+ shufps xmm3, xmm1, 10001100b
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+%endmacro
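+;; Descriptive note on the macro above: xmm2 arrives as the aeskeygenassist
+;; result, and the pshufd broadcasts its top dword, i.e.
+;; RotWord(SubWord(w[3])) XOR Rcon. The two shufps/pxor pairs (with xmm3 = 0)
+;; build the running XORs of the previous round-key words, so xmm1 leaves the
+;; macro holding the next round key per the FIPS-197 AES-128 key schedule.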
+
+%macro key_expansion_128_avx 0
+	;; Assumes xmm3 contains all zeros at this point.
+ vpshufd xmm2, xmm2, 11111111b
+ vshufps xmm3, xmm3, xmm1, 00010000b
+ vpxor xmm1, xmm1, xmm3
+ vshufps xmm3, xmm3, xmm1, 10001100b
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm2
+%endmacro
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define KEY rdi
+%define EXP_ENC_KEYS rsi
+%define EXP_DEC_KEYS rdx
+%else
+%define KEY rcx
+%define EXP_ENC_KEYS rdx
+%define EXP_DEC_KEYS r8
+%endif
+
+
+; void aes_keyexp_128(UINT8 *key,
+; UINT8 *enc_exp_keys,
+; UINT8 *dec_exp_keys);
+;
+; arg 1: rcx: pointer to key
+; arg 2: rdx: pointer to expanded key array for encrypt
+; arg 3: r8: pointer to expanded key array for decrypt
+;
+mk_global aes_keyexp_128_sse, function
+aes_keyexp_128_sse:
+ endbranch
+ movdqu xmm1, [KEY] ; loading the AES key
+ movdqu [EXP_ENC_KEYS + 16*0], xmm1
+ movdqu [EXP_DEC_KEYS + 16*10], xmm1 ; Storing key in memory
+ pxor xmm3, xmm3
+
+ aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*1], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*9], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*2], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*3], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*7], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*4], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*5], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*5], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*6], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*7], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*3], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*8], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*9], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*1], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*10], xmm1
+ movdqu [EXP_DEC_KEYS + 16*0], xmm1
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+mk_global aes_keyexp_128_avx, function
+aes_keyexp_128_avx:
+ endbranch
+ vmovdqu xmm1, [KEY] ; loading the AES key
+ vmovdqu [EXP_ENC_KEYS + 16*0], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*10], xmm1 ; Storing key in memory
+ vpxor xmm3, xmm3, xmm3
+
+ vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*1], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*9], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*2], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*3], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*7], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*4], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*5], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*5], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*6], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*7], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*3], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*8], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*9], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*1], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*10], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*0], xmm1
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; void aes_keyexp_128_enc_sse(UINT8 *key,
+; UINT8 *enc_exp_keys);
+;
+; arg 1: rcx: pointer to key
+; arg 2: rdx: pointer to expanded key array for encrypt
+;
+mk_global aes_keyexp_128_enc_sse, function
+aes_keyexp_128_enc_sse:
+ endbranch
+ movdqu xmm1, [KEY] ; loading the AES key
+ movdqu [EXP_ENC_KEYS + 16*0], xmm1
+ pxor xmm3, xmm3
+
+ aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*1], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*2], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*3], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*4], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*5], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*6], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*7], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*8], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*9], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*10], xmm1
+
+ ret
+
+mk_global aes_keyexp_128_enc_avx, function
+aes_keyexp_128_enc_avx:
+ endbranch
+ vmovdqu xmm1, [KEY] ; loading the AES key
+ vmovdqu [EXP_ENC_KEYS + 16*0], xmm1
+ vpxor xmm3, xmm3, xmm3
+
+ vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*1], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*2], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*3], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*4], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*5], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*6], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*7], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*8], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*9], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*10], xmm1
+
+ ret
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm
new file mode 100644
index 000000000..7cde5fb67
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm
@@ -0,0 +1,274 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define KEY rdi
+%define EXP_ENC_KEYS rsi
+%define EXP_DEC_KEYS rdx
+%else
+%define KEY rcx
+%define EXP_ENC_KEYS rdx
+%define EXP_DEC_KEYS r8
+%endif
+
+
+
+
+%macro key_expansion_1_192_sse 1
+	;; Assumes xmm3 contains all zeros at this point.
+ pshufd xmm2, xmm2, 11111111b
+ shufps xmm3, xmm1, 00010000b
+ pxor xmm1, xmm3
+ shufps xmm3, xmm1, 10001100b
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu [EXP_ENC_KEYS+%1], xmm1
+%endmacro
+
+; Calculate w10 and w11 using calculated w9 and known w4-w5
+%macro key_expansion_2_192_sse 1
+ movdqu xmm5, xmm4
+ pslldq xmm5, 4
+ shufps xmm6, xmm1, 11110000b
+ pxor xmm6, xmm5
+ pxor xmm4, xmm6
+ pshufd xmm7, xmm4, 00001110b
+ movdqu [EXP_ENC_KEYS+%1], xmm7
+%endmacro
+
+%macro key_dec_192_sse 1
+ movdqu xmm0, [EXP_ENC_KEYS + 16 * %1]
+ aesimc xmm1, xmm0
+ movdqu [EXP_DEC_KEYS + 16 * (12 - %1)], xmm1
+%endmacro
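+;; Descriptive note: the decryption schedule is built for the Equivalent
+;; Inverse Cipher -- each middle encryption round key is passed through
+;; aesimc (InvMixColumns) and stored in reverse order, while the first and
+;; last round keys are copied as-is.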
+
+
+
+
+
+%macro key_expansion_1_192_avx 1
+	;; Assumes xmm3 contains all zeros at this point.
+ vpshufd xmm2, xmm2, 11111111b
+ vshufps xmm3, xmm3, xmm1, 00010000b
+ vpxor xmm1, xmm1, xmm3
+ vshufps xmm3, xmm3, xmm1, 10001100b
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vmovdqu [EXP_ENC_KEYS+%1], xmm1
+%endmacro
+
+; Calculate w10 and w11 using calculated w9 and known w4-w5
+%macro key_expansion_2_192_avx 1
+ vmovdqa xmm5, xmm4
+ vpslldq xmm5, xmm5, 4
+ vshufps xmm6, xmm6, xmm1, 11110000b
+ vpxor xmm6, xmm6, xmm5
+ vpxor xmm4, xmm4, xmm6
+ vpshufd xmm7, xmm4, 00001110b
+ vmovdqu [EXP_ENC_KEYS+%1], xmm7
+%endmacro
+
+%macro key_dec_192_avx 1
+ vmovdqu xmm0, [EXP_ENC_KEYS + 16 * %1]
+ vaesimc xmm1, xmm0
+ vmovdqu [EXP_DEC_KEYS + 16 * (12 - %1)], xmm1
+%endmacro
+
+
+
+
+; void aes_keyexp_192(UINT8 *key,
+; UINT8 *enc_exp_keys,
+; UINT8 *dec_exp_keys);
+;
+; arg 1: rcx: pointer to key
+; arg 2: rdx: pointer to expanded key array for encrypt
+; arg 3: r8: pointer to expanded key array for decrypt
+;
+mk_global aes_keyexp_192_sse, function
+aes_keyexp_192_sse:
+ endbranch
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ sub rsp, 16*2 + 8
+ movdqu [rsp + 0*16], xmm6
+ movdqu [rsp + 1*16], xmm7
+%endif
+
+ movq xmm7, [KEY + 16] ; loading the AES key, 64 bits
+	movq	[EXP_ENC_KEYS + 16], xmm7	; store the upper 64 bits of the key with the expanded encryption keys
+ pshufd xmm4, xmm7, 01001111b
+ movdqu xmm1, [KEY] ; loading the AES key, 128 bits
+	movdqu	[EXP_ENC_KEYS], xmm1		; store the first 128 bits of the key with the expanded encryption keys
+ movdqu [EXP_DEC_KEYS + 16*0], xmm1
+ movdqu [EXP_DEC_KEYS + 16*12], xmm1
+
+ pxor xmm3, xmm3 ; Set xmm3 to be all zeros. Required for the key_expansion.
+	pxor xmm6, xmm6			; Set xmm6 to be all zeros. Required for the key_expansion.
+
+ aeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2
+ key_expansion_1_192_sse 24
+ key_expansion_2_192_sse 40
+
+ aeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4
+ key_expansion_1_192_sse 48
+ key_expansion_2_192_sse 64
+
+ aeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5
+ key_expansion_1_192_sse 72
+ key_expansion_2_192_sse 88
+
+ aeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7
+ key_expansion_1_192_sse 96
+ key_expansion_2_192_sse 112
+
+ aeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8
+ key_expansion_1_192_sse 120
+ key_expansion_2_192_sse 136
+
+ aeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10
+ key_expansion_1_192_sse 144
+ key_expansion_2_192_sse 160
+
+ aeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11
+ key_expansion_1_192_sse 168
+ key_expansion_2_192_sse 184
+
+ aeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12
+ key_expansion_1_192_sse 192
+
+;;; we have already saved the 12th key, which is pure input on the
+;;; ENC key path
+ movdqu xmm0, [EXP_ENC_KEYS + 16 * 12]
+ movdqu [EXP_DEC_KEYS + 16*0], xmm0
+;;; generate remaining decrypt keys
+ key_dec_192_sse 1
+ key_dec_192_sse 2
+ key_dec_192_sse 3
+ key_dec_192_sse 4
+ key_dec_192_sse 5
+ key_dec_192_sse 6
+ key_dec_192_sse 7
+ key_dec_192_sse 8
+ key_dec_192_sse 9
+ key_dec_192_sse 10
+ key_dec_192_sse 11
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ movdqu xmm6, [rsp + 0*16]
+ movdqu xmm7, [rsp + 1*16]
+ add rsp, 16*2 + 8
+%endif
+
+ ret
+
+
+
+mk_global aes_keyexp_192_avx, function
+aes_keyexp_192_avx:
+ endbranch
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ sub rsp, 16*2 + 8
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm7
+%endif
+
+ vmovq xmm7, [KEY + 16] ; loading the AES key, 64 bits
+	vmovq	[EXP_ENC_KEYS + 16], xmm7	; store the upper 64 bits of the key with the expanded encryption keys
+ vpshufd xmm4, xmm7, 01001111b
+ vmovdqu xmm1, [KEY] ; loading the AES key, 128 bits
+	vmovdqu	[EXP_ENC_KEYS], xmm1		; store the first 128 bits of the key with the expanded encryption keys
+ vmovdqu [EXP_DEC_KEYS + 16*0], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*12], xmm1
+
+ vpxor xmm3, xmm3, xmm3
+ vpxor xmm6, xmm6, xmm6
+
+ vaeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2
+ key_expansion_1_192_avx 24
+ key_expansion_2_192_avx 40
+
+ vaeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4
+ key_expansion_1_192_avx 48
+ key_expansion_2_192_avx 64
+
+ vaeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5
+ key_expansion_1_192_avx 72
+ key_expansion_2_192_avx 88
+
+ vaeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7
+ key_expansion_1_192_avx 96
+ key_expansion_2_192_avx 112
+
+ vaeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8
+ key_expansion_1_192_avx 120
+ key_expansion_2_192_avx 136
+
+ vaeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10
+ key_expansion_1_192_avx 144
+ key_expansion_2_192_avx 160
+
+ vaeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11
+ key_expansion_1_192_avx 168
+ key_expansion_2_192_avx 184
+
+ vaeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12
+ key_expansion_1_192_avx 192
+
+;;; we have already saved the 12th key, which is pure input on the
+;;; ENC key path
+ vmovdqu xmm0, [EXP_ENC_KEYS + 16 * 12]
+ vmovdqu [EXP_DEC_KEYS + 16*0], xmm0
+;;; generate remaining decrypt keys
+ key_dec_192_avx 1
+ key_dec_192_avx 2
+ key_dec_192_avx 3
+ key_dec_192_avx 4
+ key_dec_192_avx 5
+ key_dec_192_avx 6
+ key_dec_192_avx 7
+ key_dec_192_avx 8
+ key_dec_192_avx 9
+ key_dec_192_avx 10
+ key_dec_192_avx 11
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ vmovdqu xmm6, [rsp + 0*16]
+ vmovdqu xmm7, [rsp + 1*16]
+ add rsp, 16*2 + 8
+%endif
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm
new file mode 100644
index 000000000..9b3eb7688
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm
@@ -0,0 +1,286 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+; Routine to do AES key expansion
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+; Uses the f() function of the aeskeygenassist result
+%macro key_expansion_256_sse 0
+        ;; Assumes xmm3 contains all zeros at this point.
+ pshufd xmm2, xmm2, 11111111b
+ shufps xmm3, xmm1, 00010000b
+ pxor xmm1, xmm3
+ shufps xmm3, xmm1, 10001100b
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+%endmacro
+
+; Uses the SubWord function of the aeskeygenassist result
+%macro key_expansion_256_sse_2 0
+        ;; Assumes xmm3 contains all zeros at this point.
+ pshufd xmm2, xmm2, 10101010b
+ shufps xmm3, xmm4, 00010000b
+ pxor xmm4, xmm3
+ shufps xmm3, xmm4, 10001100b
+ pxor xmm4, xmm3
+ pxor xmm4, xmm2
+%endmacro
+
+; Uses the f() function of the aeskeygenassist result
+%macro key_expansion_256_avx 0
+        ;; Assumes xmm3 contains all zeros at this point.
+ vpshufd xmm2, xmm2, 11111111b
+ vshufps xmm3, xmm3, xmm1, 00010000b
+ vpxor xmm1, xmm1, xmm3
+ vshufps xmm3, xmm3, xmm1, 10001100b
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm2
+%endmacro
+
+; Uses the SubWord function of the aeskeygenassist result
+%macro key_expansion_256_avx_2 0
+        ;; Assumes xmm3 contains all zeros at this point.
+ vpshufd xmm2, xmm2, 10101010b
+ vshufps xmm3, xmm3, xmm4, 00010000b
+ vpxor xmm4, xmm4, xmm3
+ vshufps xmm3, xmm3, xmm4, 10001100b
+ vpxor xmm4, xmm4, xmm3
+ vpxor xmm4, xmm4, xmm2
+%endmacro
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define KEY rdi
+%define EXP_ENC_KEYS rsi
+%define EXP_DEC_KEYS rdx
+%else
+%define KEY rcx
+%define EXP_ENC_KEYS rdx
+%define EXP_DEC_KEYS r8
+%endif
+
+; void aes_keyexp_256(UINT8 *key,
+; UINT8 *enc_exp_keys,
+; UINT8 *dec_exp_keys);
+;
+; arg 1: KEY:          pointer to key            (rdi on elf64, rcx on win64)
+; arg 2: EXP_ENC_KEYS: pointer to expanded key array for encrypt (rsi / rdx)
+; arg 3: EXP_DEC_KEYS: pointer to expanded key array for decrypt (rdx / r8)
+;
+mk_global aes_keyexp_256_sse, function
+aes_keyexp_256_sse:
+ endbranch
+ movdqu xmm1, [KEY] ; loading the AES key
+ movdqu [EXP_ENC_KEYS + 16*0], xmm1
+ movdqu [EXP_DEC_KEYS + 16*14], xmm1 ; Storing key in memory
+
+ movdqu xmm4, [KEY+16] ; loading the AES key
+ movdqu [EXP_ENC_KEYS + 16*1], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*13], xmm0 ; Storing key in memory
+
+ pxor xmm3, xmm3 ; Required for the key_expansion.
+
+ aeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*2], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*12], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*3], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*11], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*4], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*10], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*5], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*9], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*6], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*7], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*7], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*8], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*9], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*5], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*10], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*11], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*3], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*12], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*13], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*1], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*14], xmm1
+ movdqu [EXP_DEC_KEYS + 16*0], xmm1
+
+ ret
+
+
+mk_global aes_keyexp_256_avx, function
+aes_keyexp_256_avx:
+ endbranch
+ vmovdqu xmm1, [KEY] ; loading the AES key
+ vmovdqu [EXP_ENC_KEYS + 16*0], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*14], xmm1 ; Storing key in memory
+
+ vmovdqu xmm4, [KEY+16] ; loading the AES key
+ vmovdqu [EXP_ENC_KEYS + 16*1], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*13], xmm0 ; Storing key in memory
+
+ vpxor xmm3, xmm3, xmm3 ; Required for the key_expansion.
+
+ vaeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*2], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*12], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*3], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*11], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*4], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*10], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*5], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*9], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*6], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*7], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*7], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*8], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*9], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*5], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*10], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*11], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*3], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*12], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*13], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*1], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*14], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*0], xmm1
+
+ ret
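From C, these schedules are consumed through the prototype in the comment block above; the 256-bit variant writes 15 round keys (16 * 15 = 240 bytes) into each output buffer. A hedged usage sketch follows, assuming aes_keyexp_256() is declared in aes_keyexp.h the same way the 128-bit tests further down use aes_keyexp_128().

#include <stdint.h>
#include <stdio.h>
#include <aes_keyexp.h>          /* assumed to declare aes_keyexp_256() */

int main(void)
{
    uint8_t key[32] = { 0 };                 /* example 256-bit key (all zero) */
    uint8_t enc_keys[16 * 15];               /* 15 round keys for encryption   */
    uint8_t dec_keys[16 * 15];               /* 15 round keys for decryption   */

    aes_keyexp_256(key, enc_keys, dec_keys);

    /* enc_keys[0..15] is the first half of the raw key; dec_keys[16*14..]
     * holds the same bytes, as stored by the assembly above. */
    printf("first enc round key byte: %02x\n", enc_keys[0]);
    return 0;
}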
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm
new file mode 100644
index 000000000..045649a64
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm
@@ -0,0 +1,68 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%include "reg_sizes.asm"
+
+extern aes_keyexp_128_sse
+extern aes_keyexp_128_avx
+extern aes_keyexp_128_enc_sse
+extern aes_keyexp_128_enc_avx
+
+extern aes_keyexp_192_sse
+extern aes_keyexp_192_avx
+
+extern aes_keyexp_256_sse
+extern aes_keyexp_256_avx
+
+%include "multibinary.asm"
+
+
+;;;;
+; instantiate aes_keyexp_128 interfaces
+;;;;
+mbin_interface aes_keyexp_128
+mbin_dispatch_init aes_keyexp_128, aes_keyexp_128_sse, aes_keyexp_128_avx, aes_keyexp_128_avx
+
+mbin_interface aes_keyexp_128_enc
+mbin_dispatch_init aes_keyexp_128_enc, aes_keyexp_128_enc_sse, aes_keyexp_128_enc_avx, aes_keyexp_128_enc_avx
+
+mbin_interface aes_keyexp_192
+mbin_dispatch_init aes_keyexp_192, aes_keyexp_192_sse, aes_keyexp_192_avx, aes_keyexp_192_avx
+
+mbin_interface aes_keyexp_256
+mbin_dispatch_init aes_keyexp_256, aes_keyexp_256_sse, aes_keyexp_256_avx, aes_keyexp_256_avx
+
+section .text
+;;; func core, ver, snum
+slversion aes_keyexp_128, 00, 01, 02a1
+slversion aes_keyexp_192, 00, 01, 02a2
+slversion aes_keyexp_256, 00, 01, 02a3
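Here mbin_interface creates the public symbol and mbin_dispatch_init registers the baseline (SSE) and AVX bodies; the first call probes the CPU and rebinds the symbol, so callers simply call aes_keyexp_256() and never select a variant themselves. A rough C sketch of that dispatch idea follows; hedged: this illustrates the pattern only, it is not the macro expansion in multibinary.asm, and the stub bodies plus the __builtin_cpu_supports() check (a GCC/Clang builtin) stand in for the real assembly routines and CPUID logic.

#include <stdint.h>
#include <stdio.h>

/* Stand-in implementations; in ISA-L these would be the _sse/_avx routines
 * from keyexp_256.asm. */
static void keyexp_256_sse_stub(uint8_t *k, uint8_t *e, uint8_t *d)
{ (void)k; (void)e; (void)d; puts("sse body"); }
static void keyexp_256_avx_stub(uint8_t *k, uint8_t *e, uint8_t *d)
{ (void)k; (void)e; (void)d; puts("avx body"); }

typedef void (*keyexp_fn)(uint8_t *, uint8_t *, uint8_t *);

static void keyexp_resolve(uint8_t *k, uint8_t *e, uint8_t *d);
static keyexp_fn aes_keyexp_256_ptr = keyexp_resolve;   /* starts at the resolver */

static void keyexp_resolve(uint8_t *k, uint8_t *e, uint8_t *d)
{
    /* Pick the best implementation once, then forward the call. */
    aes_keyexp_256_ptr = __builtin_cpu_supports("avx")
        ? keyexp_256_avx_stub : keyexp_256_sse_stub;
    aes_keyexp_256_ptr(k, e, d);
}

int main(void)
{
    uint8_t key[32] = { 0 }, enc[16 * 15], dec[16 * 15];
    aes_keyexp_256_ptr(key, enc, dec);       /* dispatch happens transparently */
    return 0;
}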
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h b/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h
new file mode 100644
index 000000000..80c6e1e87
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h
@@ -0,0 +1,302 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef AES_OSSL_HELPER_H_
+#define AES_OSSL_HELPER_H_
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+#include <openssl/evp.h>
+
+static inline
+ int openssl_aes_128_cbc_dec(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * cyphertext, uint8_t * plaintext)
+{
+ int outlen = 0, tmplen = 0;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_128_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_DecryptUpdate(ctx, plaintext, &outlen, (uint8_t const *)cyphertext, len))
+ printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_128_cbc\n");
+ if (!EVP_DecryptFinal_ex(ctx, &plaintext[outlen], &tmplen))
+ printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_128_cbc %x, %x, %x\n", len,
+ outlen, tmplen);
+
+ EVP_CIPHER_CTX_free(ctx);
+ return tmplen;
+}
+
+static inline
+ int openssl_aes_128_cbc_enc(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * plaintext, uint8_t * cyphertext)
+{
+ int outlen, tmplen;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_128_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_EncryptUpdate
+ (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len))
+ printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_128_cbc\n");
+ if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+ printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_128_cbc\n");
+
+ EVP_CIPHER_CTX_free(ctx);
+ return tmplen;
+}
+
+static inline
+ int openssl_aes_192_cbc_dec(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * cyphertext, uint8_t * plaintext)
+{
+ int outlen = 0, tmplen = 0;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_192_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_192_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_DecryptUpdate
+ (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+ printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_192_cbc\n");
+ if (!EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen))
+ printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_192_cbc \n");
+
+ EVP_CIPHER_CTX_free(ctx);
+ return 0;
+}
+
+static inline
+ int openssl_aes_192_cbc_enc(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * plaintext, uint8_t * cyphertext)
+{
+ int outlen, tmplen;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_192_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_192_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_EncryptUpdate
+ (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len))
+ printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_192_cbc\n");
+ if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+ printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_192_cbc\n");
+
+ EVP_CIPHER_CTX_free(ctx);
+ return 0;
+}
+
+static inline
+ int openssl_aes_256_cbc_dec(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * cyphertext, uint8_t * plaintext)
+{
+ int outlen = 0, tmplen = 0;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_256_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_DecryptUpdate
+ (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+ printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_256_cbc\n");
+ if (!EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen))
+ printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_256_cbc %x,%x\n", outlen,
+ tmplen);
+
+ EVP_CIPHER_CTX_free(ctx);
+ return 0;
+}
+
+static inline
+ int openssl_aes_256_cbc_enc(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * plaintext, uint8_t * cyphertext)
+{
+ int outlen, tmplen;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_256_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_EncryptUpdate
+ (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len))
+ printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_256_cbc\n");
+ if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+ printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_256_cbc\n");
+
+ EVP_CIPHER_CTX_free(ctx);
+ return 0;
+}
+
+static inline
+ int openssl_aes_gcm_dec(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+ int aad_len, uint8_t * tag, int tag_len, uint8_t * cyphertext,
+ int len, uint8_t * plaintext)
+{
+ int outlen = 0, tmplen = len, ret;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL))
+ printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_128_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+ if (!EVP_DecryptInit_ex(ctx, NULL, NULL, key, iv))
+ printf("\n ERROR!! EVP_DecryptInit_ex - key init\n");
+ if (!EVP_DecryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+ printf("\n ERROR!! EVP_DecryptUpdate - aad data setup\n");
+ if (!EVP_DecryptUpdate
+ (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+		printf("\n ERROR!! EVP_DecryptUpdate - CT->PT\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+
+ ret = EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen);
+ if (0 < ret) {
+ tmplen += outlen;
+ } else {
+		// Authentication failed: mismatched key, AAD or tag
+ tmplen = -1;
+ }
+
+ EVP_CIPHER_CTX_free(ctx);
+ return tmplen;
+}
+
+static inline
+ int openssl_aes_gcm_enc(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+ int aad_len, uint8_t * tag, int tag_len, uint8_t * plaintext,
+ int len, uint8_t * cyphertext)
+{
+ int outlen, tmplen;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ //printf("ivl:%x addl:%x tagl:%x ptl:%x\n", iv_len, aad_len, tag_len, len);
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL))
+		printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_128_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+ if (!EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv))
+ printf("\n ERROR!! EVP_EncryptInit_ex - init\n");
+ if (!EVP_EncryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+ printf("\n ERROR!! EVP_EncryptUpdate - aad insert\n");
+ if (!EVP_EncryptUpdate(ctx, cyphertext, &outlen, (const uint8_t *)plaintext, len))
+		printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_128_gcm\n");
+	if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+		printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_128_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - tag \n");
+
+ EVP_CIPHER_CTX_free(ctx);
+ return tmplen;
+}
+
+static inline
+ int openssl_aes_256_gcm_dec(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+ int aad_len, uint8_t * tag, int tag_len, uint8_t * cyphertext,
+ int len, uint8_t * plaintext)
+{
+ int outlen = 0, tmplen = len, ret;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_gcm(), NULL, NULL, NULL))
+		printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_256_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+ if (!EVP_DecryptInit_ex(ctx, NULL, NULL, key, iv))
+ printf("\n ERROR!! EVP_DecryptInit_ex - key init\n");
+ if (!EVP_DecryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+ printf("\n ERROR!! EVP_DecryptUpdate - aad data setup\n");
+ if (!EVP_DecryptUpdate
+ (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+		printf("\n ERROR!! EVP_DecryptUpdate - CT->PT\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+ ret = EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen);
+ if (0 < ret) {
+ tmplen += outlen;
+ } else {
+		// Authentication failed: mismatched key, AAD or tag
+ tmplen = -1;
+ }
+
+ EVP_CIPHER_CTX_free(ctx);
+ return tmplen;
+}
+
+static inline
+ int openssl_aes_256_gcm_enc(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+ int aad_len, uint8_t * tag, int tag_len, uint8_t * plaintext,
+ int len, uint8_t * cyphertext)
+{
+ int outlen, tmplen;
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_gcm(), NULL, NULL, NULL))
+		printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_256_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+ if (!EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv))
+ printf("\n ERROR!! EVP_EncryptInit_ex - init\n");
+ if (!EVP_EncryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+ printf("\n ERROR!! EVP_EncryptUpdate - aad insert\n");
+ if (!EVP_EncryptUpdate(ctx, cyphertext, &outlen, (const uint8_t *)plaintext, len))
+		printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_256_gcm\n");
+	if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+		printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_256_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - tag \n");
+
+ EVP_CIPHER_CTX_free(ctx);
+ return tmplen;
+}
+
+#endif /* AES_OSSL_HELPER_H_ */
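These wrappers only exist to give the tests an OpenSSL reference to compare against; each prints a diagnostic on failure and the GCM pair reports an authentication failure by returning -1. A hedged round-trip self-check built on the helpers above follows, assuming ossl_helper.h and the OpenSSL headers are on the include path and that a 12-byte IV with a 16-byte tag is an acceptable example choice.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include "ossl_helper.h"

int main(void)
{
    uint8_t key[16] = { 0 }, iv[12] = { 0 }, aad[8] = { 0 }, tag[16];
    uint8_t pt[64], ct[64], out[64];
    int ret;

    memset(pt, 0xa5, sizeof(pt));

    /* Encrypt, producing ciphertext and tag, then decrypt and authenticate. */
    openssl_aes_gcm_enc(key, iv, sizeof(iv), aad, sizeof(aad),
                        tag, sizeof(tag), pt, sizeof(pt), ct);
    ret = openssl_aes_gcm_dec(key, iv, sizeof(iv), aad, sizeof(aad),
                              tag, sizeof(tag), ct, sizeof(ct), out);

    if (ret < 0 || memcmp(pt, out, sizeof(pt)))
        printf("GCM round trip failed\n");
    else
        printf("GCM round trip ok\n");
    return 0;
}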
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c
new file mode 100644
index 000000000..5dc898992
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c
@@ -0,0 +1,143 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+ int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+static inline
+ int openssl_aes_128_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ unsigned char *ct, unsigned char *dt)
+{
+ int outlen, tmplen;
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, TEST_LEN))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct, *dt, *refdt;
+ unsigned char keyssl[32]; /* SSL takes both keys together */
+ struct perf start, stop;
+
+	/* Initialise our cipher context; it is reused for all the calls below */
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ printf("aes_xts_128_dec_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+ refdt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt || NULL == refdt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ /* Set up key for the SSL engine */
+ for (i = 0; i < 16; i++) {
+ keyssl[i] = key1[i];
+ keyssl[i + 16] = key2[i];
+ }
+
+ /* Encrypt and compare decrypted output */
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ openssl_aes_128_xts_dec(ctx, keyssl, tinit, ct, refdt);
+ if (memcmp(dt, refdt, TEST_LEN)) {
+ printf("ISA-L and OpenSSL results don't match\n");
+ return -1;
+ }
+
+ /* Time ISA-L decryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ perf_stop(&stop);
+ printf("aes_xts_128_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Time OpenSSL decryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ openssl_aes_128_xts_dec(ctx, keyssl, tinit, ct, refdt);
+ perf_stop(&stop);
+ printf("aes_xts_128_openssl_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ EVP_CIPHER_CTX_free(ctx);
+
+ return 0;
+}
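Two conventions are worth noting from this test: the ISA-L XTS entry points take the tweak key (key2) as their first argument and the data key (key1) second, while OpenSSL's EVP_aes_128_xts() expects one 32-byte buffer with the data key in the low half, which is why keyssl is assembled as key1 followed by key2. A hedged round-trip sketch using only the ISA-L side follows; the buffer length is arbitrary as long as it is at least one 16-byte block.

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "aes_xts.h"

int main(void)
{
    uint8_t key1[16] = { 0 }, key2[16] = { 0 }, tweak[16] = { 0 };
    uint8_t pt[512], ct[512], out[512];

    memset(pt, 0x5a, sizeof(pt));

    /* Tweak key first, data key second, then tweak, length, input, output. */
    XTS_AES_128_enc(key2, key1, tweak, sizeof(pt), pt, ct);
    XTS_AES_128_dec(key2, key1, tweak, sizeof(ct), ct, out);

    printf("%s\n", memcmp(pt, out, sizeof(pt)) ? "mismatch" : "ok");
    return 0;
}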
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c
new file mode 100644
index 000000000..fdaa8a9bb
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c
@@ -0,0 +1,125 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "aes_keyexp.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 3000000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 400
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+ int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct, *dt;
+ uint8_t expkey1_enc[16 * 11], expkey2_enc[16 * 11];
+ uint8_t expkey1_dec[16 * 11], null_key[16 * 11];
+
+ printf("aes_xts_128_dec_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ /* Decode perf test */
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+ struct perf start, stop;
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_128_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Expanded keys perf test */
+
+ aes_keyexp_128(key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_128(key2, expkey2_enc, null_key);
+ XTS_AES_128_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct, pt);
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_128_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct,
+ pt);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_128_dec_expanded_key" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c
new file mode 100644
index 000000000..69ae2e60e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c
@@ -0,0 +1,144 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void xts128_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+static inline
+ int openssl_aes_128_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct, *refct;
+ struct perf start, stop;
+ unsigned char keyssl[32]; /* SSL takes both keys together */
+
+	/* Initialise our cipher context; it is reused for all the calls below */
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ printf("aes_xts_128_enc_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ refct = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == refct) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ xts128_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (i = 0; i < 16; i++) {
+ keyssl[i] = key1[i];
+ keyssl[i + 16] = key2[i];
+ }
+
+ /* Encrypt and compare output */
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ openssl_aes_128_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct);
+ if (memcmp(ct, refct, TEST_LEN)) {
+ printf("ISA-L and OpenSSL results don't match\n");
+ return -1;
+ }
+
+ /* Time ISA-L encryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ perf_stop(&stop);
+
+ printf("aes_xts_128_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Time OpenSSL encryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ openssl_aes_128_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct);
+ perf_stop(&stop);
+
+ printf("aes_xts_128_openssl_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ EVP_CIPHER_CTX_free(ctx);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c
new file mode 100644
index 000000000..166e46652
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c
@@ -0,0 +1,123 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "aes_keyexp.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 3000000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 400
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+ int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct;
+ uint8_t expkey1_enc[16 * 11], expkey2_enc[16 * 11];
+ uint8_t expkey1_dec[16 * 11], null_key[16 * 11];
+
+ printf("aes_xts_128_enc_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ /* Encode perf test */
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+
+ struct perf start, stop;
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_128_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Expanded keys perf test */
+
+ aes_keyexp_128(key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_128(key2, expkey2_enc, null_key);
+ XTS_AES_128_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN, pt, ct);
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_128_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN, pt,
+ ct);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_128_enc_expanded_key" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
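Throughput in these perf programs is reported by the bundled test.h harness as (TEST_LEN * loops) bytes over the measured interval. Where that harness is not available, a portable approximation with clock_gettime() gives the same shape of measurement; hedged: this swaps in standard POSIX timing and arbitrary sizes and is not the perf_start()/perf_print() machinery used above.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "aes_xts.h"

#define LEN   (8 * 1024)
#define LOOPS 100000

int main(void)
{
    uint8_t key1[16] = { 0 }, key2[16] = { 0 }, tweak[16] = { 0 };
    uint8_t *pt = malloc(LEN), *ct = malloc(LEN);
    struct timespec t0, t1;
    double secs;
    int i;

    if (!pt || !ct)
        return -1;
    memset(pt, 0, LEN);

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (i = 0; i < LOOPS; i++)
        XTS_AES_128_enc(key2, key1, tweak, LEN, pt, ct);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    /* Bytes processed divided by elapsed seconds, reported in MB/s. */
    secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
    printf("%.1f MB/s\n", (double)LEN * LOOPS / secs / 1e6);

    free(pt);
    free(ct);
    return 0;
}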
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c
new file mode 100644
index 000000000..27599f0ca
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c
@@ -0,0 +1,116 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <aes_keyexp.h>
+#include "xts_128_vect.h"
+
+int main(void)
+{
+
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test;
+ uint8_t *pt_test;
+ // Arrays for expanded keys, null_key is a dummy vector (decrypt key not
+ // needed for the tweak part of the decryption)
+ uint8_t expkey1_enc[16 * 11], expkey2_enc[16 * 11];
+ uint8_t expkey1_dec[16 * 11], null_key[16 * 11];
+
+ int i, j;
+
+ // --- Encryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vlist[i].ptlen);
+ if (ct_test == NULL) {
+ printf("Can't allocate ciphertext memory\n");
+ return -1;
+ }
+ // Pre-expand keys (will only use the encryption ones here)
+ aes_keyexp_128(vlist[i].key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_128(vlist[i].key2, expkey2_enc, null_key);
+
+ XTS_AES_128_enc_expanded_key(expkey2_enc, expkey1_enc, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (ct_test[j] != vlist[i].CTX[j]) {
+ // Vectors 1-10 and 15-19 are for the 128 bit code
+ printf("\nXTS_AES_128_enc: Vector %d: ",
+ i < 9 ? i + 1 : i + 6);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ }
+
+ // --- Decryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+		// Allocate space for the calculated plaintext
+ pt_test = malloc(vlist[i].ptlen);
+ if (pt_test == NULL) {
+ printf("Can't allocate plaintext memory\n");
+ return -1;
+ }
+ // Pre-expand keys for the decryption
+ aes_keyexp_128(vlist[i].key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_128(vlist[i].key2, expkey2_enc, null_key);
+
+ // Note, encryption key is re-used for the tweak decryption step
+ XTS_AES_128_dec_expanded_key(expkey2_enc, expkey1_dec, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+		// Carry out comparison of the calculated plaintext with
+		// the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (pt_test[j] != vlist[i].PTX[j]) {
+				printf("\nXTS_AES_128_dec: Vector %d: ",
+ i < 9 ? i + 1 : i + 6);
+ printf(" failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ }
+ printf("Pass\n");
+
+ return 0;
+}
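The point of the *_expanded_key entry points is to pay for the key schedule once: aes_keyexp_128() is run up front and the XTS calls then take the pre-expanded schedules directly. Only the encryption schedule of the tweak key is ever used, even when decrypting, which is why the test passes a throwaway buffer for its decryption schedule. A hedged sketch of that flow, mirroring the calls above:

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <aes_keyexp.h>
#include <aes_xts.h>

int main(void)
{
    uint8_t key1[16] = { 0 }, key2[16] = { 0 }, tweak[16] = { 0 };
    uint8_t k1_enc[16 * 11], k1_dec[16 * 11];      /* data-key schedules     */
    uint8_t k2_enc[16 * 11], unused[16 * 11];      /* tweak key: enc only    */
    uint8_t pt[256], ct[256], out[256];

    memset(pt, 0x42, sizeof(pt));

    aes_keyexp_128(key1, k1_enc, k1_dec);
    aes_keyexp_128(key2, k2_enc, unused);          /* dec schedule discarded */

    XTS_AES_128_enc_expanded_key(k2_enc, k1_enc, tweak, sizeof(pt), pt, ct);
    XTS_AES_128_dec_expanded_key(k2_enc, k1_dec, tweak, sizeof(ct), ct, out);

    printf("%s\n", memcmp(pt, out, sizeof(pt)) ? "mismatch" : "ok");
    return 0;
}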
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c
new file mode 100644
index 000000000..4753d6778
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c
@@ -0,0 +1,247 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_xts.h>
+#include <aes_keyexp.h>
+
+#define TEST_LEN (1024*1024)
+#define TEST_SIZE (4096)
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+ int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int t, n;
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct, *dt;
+
+ int align, size, min_size;
+ unsigned char *efence_pt;
+ unsigned char *efence_ct;
+ unsigned char *efence_dt;
+
+ unsigned char *origin_pt;
+ unsigned char *origin_ct;
+ unsigned char *origin_dt;
+
+ unsigned char key1_exp_enc[16 * 11], key1_exp_dec[16 * 11];
+ unsigned char key2_exp_tw[16 * 11];
+ int i;
+
+ printf("aes_xts_128 enc/dec rand test, %d sets of %d max: ", RANDOMS, TEST_LEN);
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+ if (memcmp(pt, dt, TEST_LEN)) {
+ printf("fail\n");
+ return -1;
+ }
+ putchar('.');
+
+ // Do tests with random data, keys and message size
+ for (t = 0; t < RANDOMS; t++) {
+ n = rand() % (TEST_LEN);
+ if (n < 17)
+ continue;
+
+ mk_rand_data(key1, key2, tinit, pt, n);
+ XTS_AES_128_enc(key2, key1, tinit, n, pt, ct);
+ XTS_AES_128_dec(key2, key1, tinit, n, ct, dt);
+
+ if (memcmp(pt, dt, n)) {
+ printf("fail rand %d, size %d\n", t, n);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ // Run tests at end of buffer for Electric Fence
+ align = 1;
+ min_size = 16;
+ for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ XTS_AES_128_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_128_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+ printf("efence: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ origin_pt = malloc(TEST_LEN);
+ origin_ct = malloc(TEST_LEN);
+ origin_dt = malloc(TEST_LEN);
+ if (NULL == origin_pt || NULL == origin_ct || NULL == origin_dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+ // For data lengths from 0 to 15 bytes, the functions return without any error
+ // codes, without reading or writing any data.
+ for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+ memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+ memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+ XTS_AES_128_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_128_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+ printf("efence_pt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+ printf("efence_ct: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+ printf("efence_dt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ for (i = 0; i < 16 * 11; i++) {
+ key2_exp_tw[i] = rand();
+ }
+
+ for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ aes_keyexp_128(key1, key1_exp_enc, key1_exp_dec);
+
+ XTS_AES_128_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+ TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_128_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+ TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+ printf("efence_expanded_key: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ // For data lengths from 0 to 15 bytes, the functions return without any error
+ // codes, without reading or writing any data.
+ for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+ memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+ memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+ aes_keyexp_128(key1, key1_exp_enc, key1_exp_dec);
+
+ XTS_AES_128_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+ TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_128_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+ TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+ printf("efence_expanded_key for pt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+ printf("efence_expanded_key for ct: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+ printf("efence_expanded_key for dt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ printf("Pass\n");
+
+ return 0;
+}
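The end-of-buffer loops above are an "Electric Fence" style check: the test region is slid up against the end of the allocation so that any read or write past the requested length falls outside the allocation and can be trapped by a guard-page allocator or a sanitizer. A stripped-down sketch of the same pointer arithmetic follows; hedged: the sizes are arbitrary and the actual trapping comes from running under such a tool, not from this code.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include "aes_xts.h"

#define BUF_LEN  (64 * 1024)
#define REGION   4096

int main(void)
{
    uint8_t key1[16] = { 0 }, key2[16] = { 0 }, tweak[16] = { 0 };
    uint8_t *pt = malloc(BUF_LEN), *ct = malloc(BUF_LEN);
    int size;

    if (!pt || !ct)
        return -1;
    memset(pt, 0, BUF_LEN);

    /* Slide the test region toward the very end of the allocation; an
     * implementation that touches even one byte too many now faults under a
     * guard-page allocator instead of silently corrupting memory. */
    for (size = REGION; size >= 16; size--) {
        uint8_t *p = pt + BUF_LEN - size;
        uint8_t *c = ct + BUF_LEN - size;
        XTS_AES_128_enc(key2, key1, tweak, size, p, c);
    }

    free(pt);
    free(ct);
    return 0;
}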
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c
new file mode 100644
index 000000000..065b84465
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c
@@ -0,0 +1,271 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+#include <stdlib.h>
+#include <openssl/evp.h>
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#ifndef RANDOMS
+# define RANDOMS 128
+#endif
+#define TEST_LOOPS 128
+#define TEST_LEN (1024*1024)
+#define LENGTH_SCAN (2*1024)
+
+/* Generates random data for keys, tweak and plaintext */
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+ int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+/* Wrapper for OpenSSL EVP AES-XTS 128 encryption */
+static inline
+ int openssl_aes_128_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv)
+ || (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ || (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))) {
+ printf("\n Error in openssl encoding of %d bytes\n", len);
+ return 1;
+ }
+ return 0;
+}
+
+/* Wrapper for OpenSSL EVP AES-XTS 128 decryption */
+static inline
+ int openssl_aes_128_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *ct, unsigned char *dt)
+{
+ int outlen, tmplen;
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv)
+ || (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, len))
+ || (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen))) {
+ printf("\n Error in openssl decoding of %d bytes\n", len);
+ return 1;
+ }
+ return 0;
+}
+
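+/* Note on key ordering, as used by the calls below: the ISA-L routines take
+ * the tweak key (key2) first and the data key (key1) second, e.g.
+ *   XTS_AES_128_enc(key2, key1, tweak, len, pt, ct);
+ * while OpenSSL's EVP_aes_128_xts() takes a single 32-byte key laid out as
+ * key1 || key2, which is why keyssl[] is built by concatenating both keys.
+ */
+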
+int main(int argc, char **argv)
+{
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct, *dt, *refct, *refdt;
+ unsigned char keyssl[32]; /* SSL takes both keys together */
+ unsigned int rand_len, t;
+ int i, j, k, ret;
+ int seed;
+
+ if (argc == 1)
+ seed = TEST_SEED;
+ else
+ seed = atoi(argv[1]);
+
+ srand(seed);
+ printf("SEED: %d\n", seed);
+
+	/* Initialise our cipher context; it is reused for all of the tests below */
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ /* Allocate space for input and output buffers */
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+ refct = malloc(TEST_LEN);
+ refdt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt || NULL == refct || NULL == refdt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ /**************************** LENGTH SCAN TEST *************************/
+	printf("aes_xts_128_rand_ossl test, %d sets of various lengths: ", LENGTH_SCAN);
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 16; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 16] = key2[k];
+ }
+
+ for (ret = 0, i = 16; ret == 0 && i < LENGTH_SCAN; i++) {
+
+ /* Encrypt using each method */
+ XTS_AES_128_enc(key2, key1, tinit, i, pt, ct);
+ ret |= openssl_aes_128_xts_enc(ctx, keyssl, tinit, i, pt, refct);
+
+ // Compare
+ for (ret = 0, j = 0; j < i && ret == 0; j++) {
+ if (ct[j] != refct[j])
+ ret = 1;
+ }
+ if (ret)
+ printf(" XTS_AES_128_enc size=%d failed at byte %d!\n", i, j);
+
+ /* Decrypt using each method */
+ XTS_AES_128_dec(key2, key1, tinit, i, ct, dt);
+ ret |= openssl_aes_128_xts_dec(ctx, keyssl, tinit, i, refct, refdt);
+
+		for (j = 0; j < i && ret == 0; j++) {
+ if (dt[j] != refdt[j])
+ ret = 1;
+ }
+ if (ret)
+ printf(" XTS_AES_128_dec size=%d failed at byte %d!\n", i, j);
+ if (0 == i % (LENGTH_SCAN / 16))
+ printf(".");
+ fflush(0);
+ }
+ if (ret)
+ return -1;
+ printf("Pass\n");
+
+ /**************************** FIXED LENGTH TEST *************************/
+ printf("aes_xts_128_rand_ossl test, %d sets of length %d: ", TEST_LOOPS, TEST_LEN);
+
+ // Loop over the vectors
+ for (i = 0; i < TEST_LOOPS; i++) {
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 16; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 16] = key2[k];
+ }
+
+ /* Encrypt using each method */
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ if (openssl_aes_128_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct))
+ return -1;
+
+ /* Carry out comparison of the calculated ciphertext with
+ * the reference
+ */
+ for (j = 0; j < TEST_LEN; j++) {
+
+ if (ct[j] != refct[j]) {
+ printf("XTS_AES_128_enc failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+
+ /* Decrypt using each method */
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ if (openssl_aes_128_xts_dec(ctx, keyssl, tinit, TEST_LEN, refct, refdt))
+ return -1;
+
+ for (j = 0; j < TEST_LEN; j++) {
+
+ if (dt[j] != refdt[j]) {
+ printf("XTS_AES_128_dec failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ if (0 == i % (TEST_LOOPS / 16))
+ printf(".");
+ fflush(0);
+ }
+ printf("Pass\n");
+
+ /**************************** RANDOM LENGTH TEST *************************/
+ printf("aes_xts_128_rand_ossl test, %d sets of random lengths: ", RANDOMS);
+
+ /* Run tests with random size */
+
+ for (t = 0; t < RANDOMS; t++) {
+
+ rand_len = rand() % (TEST_LEN);
+ rand_len = rand_len < 16 ? 16 : rand_len;
+ mk_rand_data(key1, key2, tinit, pt, rand_len);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 16; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 16] = key2[k];
+ }
+
+ /* Encrypt using each method */
+ XTS_AES_128_enc(key2, key1, tinit, rand_len, pt, ct);
+ if (openssl_aes_128_xts_enc(ctx, keyssl, tinit, rand_len, pt, refct))
+ return -1;
+
+ /* Carry out comparison of the calculated ciphertext with
+ * the reference
+ */
+ for (j = 0; j < rand_len; j++) {
+
+ if (ct[j] != refct[j]) {
+ printf("XTS_AES_128_enc failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+
+ /* Decrypt using each method */
+ XTS_AES_128_dec(key2, key1, tinit, rand_len, ct, dt);
+ if (openssl_aes_128_xts_dec(ctx, keyssl, tinit, rand_len, refct, refdt))
+ return -1;
+
+ for (j = 0; j < rand_len; j++) {
+
+ if (dt[j] != refdt[j]) {
+ printf("XTS_AES_128_dec failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ if (0 == t % (RANDOMS / 16))
+ printf(".");
+ fflush(0);
+ }
+
+ EVP_CIPHER_CTX_free(ctx);
+
+ printf("Pass\n");
+
+ printf("aes_xts_128_rand_ossl: All tests passed\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c
new file mode 100644
index 000000000..5dd57e33c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c
@@ -0,0 +1,106 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "xts_128_vect.h"
+
+int main(void)
+{
+
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test;
+ uint8_t *pt_test;
+
+ int i, j;
+
+ // --- Encryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vlist[i].ptlen);
+ if (ct_test == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return -1;
+ }
+
+ XTS_AES_128_enc(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (ct_test[j] != vlist[i].CTX[j]) {
+				// Vectors 1-9 and 15-19 are for the 128 bit code
+ printf("\nXTS_AES_128_enc: Vector %d: ",
+ i < 9 ? i + 1 : i + 6);
+
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+		printf(".");
+		free(ct_test);
+	}
+
+ // --- Decryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+		// Allocate space for the calculated plaintext
+ pt_test = malloc(vlist[i].ptlen);
+ if (pt_test == NULL) {
+ fprintf(stderr, "Can't allocate plaintext memory\n");
+ return -1;
+ }
+
+ XTS_AES_128_dec(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+		// Carry out comparison of the calculated plaintext with
+		// the reference
+		for (j = 0; j < vlist[i].ptlen; j++) {
+
+			if (pt_test[j] != vlist[i].PTX[j]) {
+				// Vectors 1-9 and 15-19 are for the 128 bit code
+				printf("\nXTS_AES_128_dec: Vector %d: ",
+				       i < 9 ? i + 1 : i + 6);
+
+				printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+		printf(".");
+		free(pt_test);
+	}
+ printf("Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h
new file mode 100644
index 000000000..fce792dc7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h
@@ -0,0 +1,1691 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+
+#define NVEC 14
+
+// struct to hold pointers to the key, plaintext and ciphertext vectors
+struct xts_vector {
+ uint64_t ptlen; // length of our plaintext
+ uint8_t *key1; // dimension 16 for 128 bit aes
+ uint8_t *key2; // dimension 16 for 128 bit aes
+ uint8_t *TW; // dimension 16 for both 128 and 256 bit
+ uint8_t *PTX; // min. dimension 16
+ uint8_t *CTX; // same dimension as PTX
+};
+
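+/* Each vN_* array below is gathered into a vlist[NVEC] table of these structs
+ * later in this header (conceptually
+ * { sizeof(vN_PTX), vN_key1, vN_key2, vN_TW, vN_PTX, vN_CTX }); that table is
+ * what xts_128_test.c iterates over.
+ */
+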
+/* Define our test vectors statically here. Test vectors are from the standard:
+ * "IEEE Standard for Cryptographic Protection of Data on Block-Oriented
+ * Storage Devices"
+ * http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4493450
+ *
+ * Vector 1
+ * Key1 00000000000000000000000000000000
+ * Key2 00000000000000000000000000000000
+ * Data Unit Sequence number 0
+ * PTX 0000000000000000000000000000000000000000000000000000000000000000 /128bit
+ * TWK 66e94bd4ef8a2c3b884cfa59ca342b2eccd297a8df1559761099f4b39469565c
+ * CTX 917cf69ebd68b2ec9b9fe9a3eadda692cd43d2f59598ed858c02c2652fbf922e
+ * Plaintext length (bytes): 32
+ */
+
+static uint8_t v1_key1[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v1_key2[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v1_TW[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v1_PTX[32] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v1_CTX[32] = {
+ 0x91, 0x7c, 0xf6, 0x9e, 0xbd, 0x68, 0xb2, 0xec,
+ 0x9b, 0x9f, 0xe9, 0xa3, 0xea, 0xdd, 0xa6, 0x92,
+ 0xcd, 0x43, 0xd2, 0xf5, 0x95, 0x98, 0xed, 0x85,
+ 0x8c, 0x02, 0xc2, 0x65, 0x2f, 0xbf, 0x92, 0x2e
+};
+
+/*
+ * Vector 2
+ * Key1 11111111111111111111111111111111
+ * Key2 22222222222222222222222222222222
+ * Data Unit Sequence number 3333333333
+ * PTX 4444444444444444444444444444444444444444444444444444444444444444
+ * TWK 3f803bcd0d7fd2b37558419f59d5cda6f900779a1bfea467ebb0823eb3aa9b4d
+ * CTX c454185e6a16936e39334038acef838bfb186fff7480adc4289382ecd6d394f0
+ * Plaintext length (bytes): 32
+ */
+
+static uint8_t v2_key1[16] = {
+ 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+ 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11
+};
+
+static uint8_t v2_key2[16] = {
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22
+};
+
+static uint8_t v2_TW[16] = {
+ 0x33, 0x33, 0x33, 0x33, 0x33, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v2_PTX[32] = {
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44
+};
+
+static uint8_t v2_CTX[32] = {
+ 0xc4, 0x54, 0x18, 0x5e, 0x6a, 0x16, 0x93, 0x6e,
+ 0x39, 0x33, 0x40, 0x38, 0xac, 0xef, 0x83, 0x8b,
+ 0xfb, 0x18, 0x6f, 0xff, 0x74, 0x80, 0xad, 0xc4,
+ 0x28, 0x93, 0x82, 0xec, 0xd6, 0xd3, 0x94, 0xf0
+};
+
+/*
+ * Vector 3
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 22222222222222222222222222222222
+ * Data Unit Sequence number 3333333333
+ * PTX 4444444444444444444444444444444444444444444444444444444444444444
+ * TWK 3f803bcd0d7fd2b37558419f59d5cda6f900779a1bfea467ebb0823eb3aa9b4d
+ * CTX af85336b597afc1a900b2eb21ec949d292df4c047e0b21532186a5971a227a89
+ * Plaintext length (bytes): 32
+ */
+
+static uint8_t v3_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v3_key2[16] = {
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22
+};
+
+static uint8_t v3_TW[16] = {
+ 0x33, 0x33, 0x33, 0x33, 0x33, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v3_PTX[32] = {
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44
+};
+
+static uint8_t v3_CTX[32] = {
+ 0xaf, 0x85, 0x33, 0x6b, 0x59, 0x7a, 0xfc, 0x1a,
+ 0x90, 0x0b, 0x2e, 0xb2, 0x1e, 0xc9, 0x49, 0xd2,
+ 0x92, 0xdf, 0x4c, 0x04, 0x7e, 0x0b, 0x21, 0x53,
+ 0x21, 0x86, 0xa5, 0x97, 0x1a, 0x22, 0x7a, 0x89
+};
+
+/*
+ * Vector 4
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence number 0
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89c
+ * CTX c78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412
+ * CTX 328063fd2aab53e5ea1e0a9f332500a5df9487d07a5c92cc512c8866c7e860ce
+ * CTX 93fdf166a24912b422976146ae20ce846bb7dc9ba94a767aaef20c0d61ad0265
+ * CTX 5ea92dc4c4e41a8952c651d33174be51a10c421110e6d81588ede82103a252d8
+ * CTX a750e8768defffed9122810aaeb99f9172af82b604dc4b8e51bcb08235a6f434
+ * CTX 1332e4ca60482a4ba1a03b3e65008fc5da76b70bf1690db4eae29c5f1badd03c
+ * CTX 5ccf2a55d705ddcd86d449511ceb7ec30bf12b1fa35b913f9f747a8afd1b130e
+ * CTX 94bff94effd01a91735ca1726acd0b197c4e5b03393697e126826fb6bbde8ecc
+ * CTX 1e08298516e2c9ed03ff3c1b7860f6de76d4cecd94c8119855ef5297ca67e9f3
+ * CTX e7ff72b1e99785ca0a7e7720c5b36dc6d72cac9574c8cbbc2f801e23e56fd344
+ * CTX b07f22154beba0f08ce8891e643ed995c94d9a69c9f1b5f499027a78572aeebd
+ * CTX 74d20cc39881c213ee770b1010e4bea718846977ae119f7a023ab58cca0ad752
+ * CTX afe656bb3c17256a9f6e9bf19fdd5a38fc82bbe872c5539edb609ef4f79c203e
+ * CTX bb140f2e583cb2ad15b4aa5b655016a8449277dbd477ef2c8d6c017db738b18d
+ * CTX eb4a427d1923ce3ff262735779a418f20a282df920147beabe421ee5319d0568
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v4_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v4_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v4_TW[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v4_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v4_CTX[512] = {
+ 0x27, 0xa7, 0x47, 0x9b, 0xef, 0xa1, 0xd4, 0x76,
+ 0x48, 0x9f, 0x30, 0x8c, 0xd4, 0xcf, 0xa6, 0xe2,
+ 0xa9, 0x6e, 0x4b, 0xbe, 0x32, 0x08, 0xff, 0x25,
+ 0x28, 0x7d, 0xd3, 0x81, 0x96, 0x16, 0xe8, 0x9c,
+ 0xc7, 0x8c, 0xf7, 0xf5, 0xe5, 0x43, 0x44, 0x5f,
+ 0x83, 0x33, 0xd8, 0xfa, 0x7f, 0x56, 0x00, 0x00,
+ 0x05, 0x27, 0x9f, 0xa5, 0xd8, 0xb5, 0xe4, 0xad,
+ 0x40, 0xe7, 0x36, 0xdd, 0xb4, 0xd3, 0x54, 0x12,
+ 0x32, 0x80, 0x63, 0xfd, 0x2a, 0xab, 0x53, 0xe5,
+ 0xea, 0x1e, 0x0a, 0x9f, 0x33, 0x25, 0x00, 0xa5,
+ 0xdf, 0x94, 0x87, 0xd0, 0x7a, 0x5c, 0x92, 0xcc,
+ 0x51, 0x2c, 0x88, 0x66, 0xc7, 0xe8, 0x60, 0xce,
+ 0x93, 0xfd, 0xf1, 0x66, 0xa2, 0x49, 0x12, 0xb4,
+ 0x22, 0x97, 0x61, 0x46, 0xae, 0x20, 0xce, 0x84,
+ 0x6b, 0xb7, 0xdc, 0x9b, 0xa9, 0x4a, 0x76, 0x7a,
+ 0xae, 0xf2, 0x0c, 0x0d, 0x61, 0xad, 0x02, 0x65,
+ 0x5e, 0xa9, 0x2d, 0xc4, 0xc4, 0xe4, 0x1a, 0x89,
+ 0x52, 0xc6, 0x51, 0xd3, 0x31, 0x74, 0xbe, 0x51,
+ 0xa1, 0x0c, 0x42, 0x11, 0x10, 0xe6, 0xd8, 0x15,
+ 0x88, 0xed, 0xe8, 0x21, 0x03, 0xa2, 0x52, 0xd8,
+ 0xa7, 0x50, 0xe8, 0x76, 0x8d, 0xef, 0xff, 0xed,
+ 0x91, 0x22, 0x81, 0x0a, 0xae, 0xb9, 0x9f, 0x91,
+ 0x72, 0xaf, 0x82, 0xb6, 0x04, 0xdc, 0x4b, 0x8e,
+ 0x51, 0xbc, 0xb0, 0x82, 0x35, 0xa6, 0xf4, 0x34,
+ 0x13, 0x32, 0xe4, 0xca, 0x60, 0x48, 0x2a, 0x4b,
+ 0xa1, 0xa0, 0x3b, 0x3e, 0x65, 0x00, 0x8f, 0xc5,
+ 0xda, 0x76, 0xb7, 0x0b, 0xf1, 0x69, 0x0d, 0xb4,
+ 0xea, 0xe2, 0x9c, 0x5f, 0x1b, 0xad, 0xd0, 0x3c,
+ 0x5c, 0xcf, 0x2a, 0x55, 0xd7, 0x05, 0xdd, 0xcd,
+ 0x86, 0xd4, 0x49, 0x51, 0x1c, 0xeb, 0x7e, 0xc3,
+ 0x0b, 0xf1, 0x2b, 0x1f, 0xa3, 0x5b, 0x91, 0x3f,
+ 0x9f, 0x74, 0x7a, 0x8a, 0xfd, 0x1b, 0x13, 0x0e,
+ 0x94, 0xbf, 0xf9, 0x4e, 0xff, 0xd0, 0x1a, 0x91,
+ 0x73, 0x5c, 0xa1, 0x72, 0x6a, 0xcd, 0x0b, 0x19,
+ 0x7c, 0x4e, 0x5b, 0x03, 0x39, 0x36, 0x97, 0xe1,
+ 0x26, 0x82, 0x6f, 0xb6, 0xbb, 0xde, 0x8e, 0xcc,
+ 0x1e, 0x08, 0x29, 0x85, 0x16, 0xe2, 0xc9, 0xed,
+ 0x03, 0xff, 0x3c, 0x1b, 0x78, 0x60, 0xf6, 0xde,
+ 0x76, 0xd4, 0xce, 0xcd, 0x94, 0xc8, 0x11, 0x98,
+ 0x55, 0xef, 0x52, 0x97, 0xca, 0x67, 0xe9, 0xf3,
+ 0xe7, 0xff, 0x72, 0xb1, 0xe9, 0x97, 0x85, 0xca,
+ 0x0a, 0x7e, 0x77, 0x20, 0xc5, 0xb3, 0x6d, 0xc6,
+ 0xd7, 0x2c, 0xac, 0x95, 0x74, 0xc8, 0xcb, 0xbc,
+ 0x2f, 0x80, 0x1e, 0x23, 0xe5, 0x6f, 0xd3, 0x44,
+ 0xb0, 0x7f, 0x22, 0x15, 0x4b, 0xeb, 0xa0, 0xf0,
+ 0x8c, 0xe8, 0x89, 0x1e, 0x64, 0x3e, 0xd9, 0x95,
+ 0xc9, 0x4d, 0x9a, 0x69, 0xc9, 0xf1, 0xb5, 0xf4,
+ 0x99, 0x02, 0x7a, 0x78, 0x57, 0x2a, 0xee, 0xbd,
+ 0x74, 0xd2, 0x0c, 0xc3, 0x98, 0x81, 0xc2, 0x13,
+ 0xee, 0x77, 0x0b, 0x10, 0x10, 0xe4, 0xbe, 0xa7,
+ 0x18, 0x84, 0x69, 0x77, 0xae, 0x11, 0x9f, 0x7a,
+ 0x02, 0x3a, 0xb5, 0x8c, 0xca, 0x0a, 0xd7, 0x52,
+ 0xaf, 0xe6, 0x56, 0xbb, 0x3c, 0x17, 0x25, 0x6a,
+ 0x9f, 0x6e, 0x9b, 0xf1, 0x9f, 0xdd, 0x5a, 0x38,
+ 0xfc, 0x82, 0xbb, 0xe8, 0x72, 0xc5, 0x53, 0x9e,
+ 0xdb, 0x60, 0x9e, 0xf4, 0xf7, 0x9c, 0x20, 0x3e,
+ 0xbb, 0x14, 0x0f, 0x2e, 0x58, 0x3c, 0xb2, 0xad,
+ 0x15, 0xb4, 0xaa, 0x5b, 0x65, 0x50, 0x16, 0xa8,
+ 0x44, 0x92, 0x77, 0xdb, 0xd4, 0x77, 0xef, 0x2c,
+ 0x8d, 0x6c, 0x01, 0x7d, 0xb7, 0x38, 0xb1, 0x8d,
+ 0xeb, 0x4a, 0x42, 0x7d, 0x19, 0x23, 0xce, 0x3f,
+ 0xf2, 0x62, 0x73, 0x57, 0x79, 0xa4, 0x18, 0xf2,
+ 0x0a, 0x28, 0x2d, 0xf9, 0x20, 0x14, 0x7b, 0xea,
+ 0xbe, 0x42, 0x1e, 0xe5, 0x31, 0x9d, 0x05, 0x68
+};
+
+/*
+ * Vector 5
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number 01
+ * PTX 27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89c
+ * PTX c78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412
+ * PTX 328063fd2aab53e5ea1e0a9f332500a5df9487d07a5c92cc512c8866c7e860ce
+ * PTX 93fdf166a24912b422976146ae20ce846bb7dc9ba94a767aaef20c0d61ad0265
+ * PTX 5ea92dc4c4e41a8952c651d33174be51a10c421110e6d81588ede82103a252d8
+ * PTX a750e8768defffed9122810aaeb99f9172af82b604dc4b8e51bcb08235a6f434
+ * PTX 1332e4ca60482a4ba1a03b3e65008fc5da76b70bf1690db4eae29c5f1badd03c
+ * PTX 5ccf2a55d705ddcd86d449511ceb7ec30bf12b1fa35b913f9f747a8afd1b130e
+ * PTX 94bff94effd01a91735ca1726acd0b197c4e5b03393697e126826fb6bbde8ecc
+ * PTX 1e08298516e2c9ed03ff3c1b7860f6de76d4cecd94c8119855ef5297ca67e9f3
+ * PTX e7ff72b1e99785ca0a7e7720c5b36dc6d72cac9574c8cbbc2f801e23e56fd344
+ * PTX b07f22154beba0f08ce8891e643ed995c94d9a69c9f1b5f499027a78572aeebd
+ * PTX 74d20cc39881c213ee770b1010e4bea718846977ae119f7a023ab58cca0ad752
+ * PTX afe656bb3c17256a9f6e9bf19fdd5a38fc82bbe872c5539edb609ef4f79c203e
+ * PTX bb140f2e583cb2ad15b4aa5b655016a8449277dbd477ef2c8d6c017db738b18d
+ * PTX eb4a427d1923ce3ff262735779a418f20a282df920147beabe421ee5319d0568
+ * CTX 264d3ca8512194fec312c8c9891f279fefdd608d0c027b60483a3fa811d65ee5
+ * CTX 9d52d9e40ec5672d81532b38b6b089ce951f0f9c35590b8b978d175213f329bb
+ * CTX 1c2fd30f2f7f30492a61a532a79f51d36f5e31a7c9a12c286082ff7d2394d18f
+ * CTX 783e1a8e72c722caaaa52d8f065657d2631fd25bfd8e5baad6e527d763517501
+ * CTX c68c5edc3cdd55435c532d7125c8614deed9adaa3acade5888b87bef641c4c99
+ * CTX 4c8091b5bcd387f3963fb5bc37aa922fbfe3df4e5b915e6eb514717bdd2a7407
+ * CTX 9a5073f5c4bfd46adf7d282e7a393a52579d11a028da4d9cd9c77124f9648ee3
+ * CTX 83b1ac763930e7162a8d37f350b2f74b8472cf09902063c6b32e8c2d9290cefb
+ * CTX d7346d1c779a0df50edcde4531da07b099c638e83a755944df2aef1aa31752fd
+ * CTX 323dcb710fb4bfbb9d22b925bc3577e1b8949e729a90bbafeacf7f7879e7b114
+ * CTX 7e28ba0bae940db795a61b15ecf4df8db07b824bb062802cc98a9545bb2aaeed
+ * CTX 77cb3fc6db15dcd7d80d7d5bc406c4970a3478ada8899b329198eb61c193fb62
+ * CTX 75aa8ca340344a75a862aebe92eee1ce032fd950b47d7704a3876923b4ad6284
+ * CTX 4bf4a09c4dbe8b4397184b7471360c9564880aedddb9baa4af2e75394b08cd32
+ * CTX ff479c57a07d3eab5d54de5f9738b8d27f27a9f0ab11799d7b7ffefb2704c95c
+ * CTX 6ad12c39f1e867a4b7b1d7818a4b753dfd2a89ccb45e001a03a867b187f225dd
+ * Plaintext length (bytes): 512
+ */
+
+static uint8_t v5_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v5_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v5_TW[16] = {
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v5_PTX[512] = {
+ 0x27, 0xa7, 0x47, 0x9b, 0xef, 0xa1, 0xd4, 0x76,
+ 0x48, 0x9f, 0x30, 0x8c, 0xd4, 0xcf, 0xa6, 0xe2,
+ 0xa9, 0x6e, 0x4b, 0xbe, 0x32, 0x08, 0xff, 0x25,
+ 0x28, 0x7d, 0xd3, 0x81, 0x96, 0x16, 0xe8, 0x9c,
+ 0xc7, 0x8c, 0xf7, 0xf5, 0xe5, 0x43, 0x44, 0x5f,
+ 0x83, 0x33, 0xd8, 0xfa, 0x7f, 0x56, 0x00, 0x00,
+ 0x05, 0x27, 0x9f, 0xa5, 0xd8, 0xb5, 0xe4, 0xad,
+ 0x40, 0xe7, 0x36, 0xdd, 0xb4, 0xd3, 0x54, 0x12,
+ 0x32, 0x80, 0x63, 0xfd, 0x2a, 0xab, 0x53, 0xe5,
+ 0xea, 0x1e, 0x0a, 0x9f, 0x33, 0x25, 0x00, 0xa5,
+ 0xdf, 0x94, 0x87, 0xd0, 0x7a, 0x5c, 0x92, 0xcc,
+ 0x51, 0x2c, 0x88, 0x66, 0xc7, 0xe8, 0x60, 0xce,
+ 0x93, 0xfd, 0xf1, 0x66, 0xa2, 0x49, 0x12, 0xb4,
+ 0x22, 0x97, 0x61, 0x46, 0xae, 0x20, 0xce, 0x84,
+ 0x6b, 0xb7, 0xdc, 0x9b, 0xa9, 0x4a, 0x76, 0x7a,
+ 0xae, 0xf2, 0x0c, 0x0d, 0x61, 0xad, 0x02, 0x65,
+ 0x5e, 0xa9, 0x2d, 0xc4, 0xc4, 0xe4, 0x1a, 0x89,
+ 0x52, 0xc6, 0x51, 0xd3, 0x31, 0x74, 0xbe, 0x51,
+ 0xa1, 0x0c, 0x42, 0x11, 0x10, 0xe6, 0xd8, 0x15,
+ 0x88, 0xed, 0xe8, 0x21, 0x03, 0xa2, 0x52, 0xd8,
+ 0xa7, 0x50, 0xe8, 0x76, 0x8d, 0xef, 0xff, 0xed,
+ 0x91, 0x22, 0x81, 0x0a, 0xae, 0xb9, 0x9f, 0x91,
+ 0x72, 0xaf, 0x82, 0xb6, 0x04, 0xdc, 0x4b, 0x8e,
+ 0x51, 0xbc, 0xb0, 0x82, 0x35, 0xa6, 0xf4, 0x34,
+ 0x13, 0x32, 0xe4, 0xca, 0x60, 0x48, 0x2a, 0x4b,
+ 0xa1, 0xa0, 0x3b, 0x3e, 0x65, 0x00, 0x8f, 0xc5,
+ 0xda, 0x76, 0xb7, 0x0b, 0xf1, 0x69, 0x0d, 0xb4,
+ 0xea, 0xe2, 0x9c, 0x5f, 0x1b, 0xad, 0xd0, 0x3c,
+ 0x5c, 0xcf, 0x2a, 0x55, 0xd7, 0x05, 0xdd, 0xcd,
+ 0x86, 0xd4, 0x49, 0x51, 0x1c, 0xeb, 0x7e, 0xc3,
+ 0x0b, 0xf1, 0x2b, 0x1f, 0xa3, 0x5b, 0x91, 0x3f,
+ 0x9f, 0x74, 0x7a, 0x8a, 0xfd, 0x1b, 0x13, 0x0e,
+ 0x94, 0xbf, 0xf9, 0x4e, 0xff, 0xd0, 0x1a, 0x91,
+ 0x73, 0x5c, 0xa1, 0x72, 0x6a, 0xcd, 0x0b, 0x19,
+ 0x7c, 0x4e, 0x5b, 0x03, 0x39, 0x36, 0x97, 0xe1,
+ 0x26, 0x82, 0x6f, 0xb6, 0xbb, 0xde, 0x8e, 0xcc,
+ 0x1e, 0x08, 0x29, 0x85, 0x16, 0xe2, 0xc9, 0xed,
+ 0x03, 0xff, 0x3c, 0x1b, 0x78, 0x60, 0xf6, 0xde,
+ 0x76, 0xd4, 0xce, 0xcd, 0x94, 0xc8, 0x11, 0x98,
+ 0x55, 0xef, 0x52, 0x97, 0xca, 0x67, 0xe9, 0xf3,
+ 0xe7, 0xff, 0x72, 0xb1, 0xe9, 0x97, 0x85, 0xca,
+ 0x0a, 0x7e, 0x77, 0x20, 0xc5, 0xb3, 0x6d, 0xc6,
+ 0xd7, 0x2c, 0xac, 0x95, 0x74, 0xc8, 0xcb, 0xbc,
+ 0x2f, 0x80, 0x1e, 0x23, 0xe5, 0x6f, 0xd3, 0x44,
+ 0xb0, 0x7f, 0x22, 0x15, 0x4b, 0xeb, 0xa0, 0xf0,
+ 0x8c, 0xe8, 0x89, 0x1e, 0x64, 0x3e, 0xd9, 0x95,
+ 0xc9, 0x4d, 0x9a, 0x69, 0xc9, 0xf1, 0xb5, 0xf4,
+ 0x99, 0x02, 0x7a, 0x78, 0x57, 0x2a, 0xee, 0xbd,
+ 0x74, 0xd2, 0x0c, 0xc3, 0x98, 0x81, 0xc2, 0x13,
+ 0xee, 0x77, 0x0b, 0x10, 0x10, 0xe4, 0xbe, 0xa7,
+ 0x18, 0x84, 0x69, 0x77, 0xae, 0x11, 0x9f, 0x7a,
+ 0x02, 0x3a, 0xb5, 0x8c, 0xca, 0x0a, 0xd7, 0x52,
+ 0xaf, 0xe6, 0x56, 0xbb, 0x3c, 0x17, 0x25, 0x6a,
+ 0x9f, 0x6e, 0x9b, 0xf1, 0x9f, 0xdd, 0x5a, 0x38,
+ 0xfc, 0x82, 0xbb, 0xe8, 0x72, 0xc5, 0x53, 0x9e,
+ 0xdb, 0x60, 0x9e, 0xf4, 0xf7, 0x9c, 0x20, 0x3e,
+ 0xbb, 0x14, 0x0f, 0x2e, 0x58, 0x3c, 0xb2, 0xad,
+ 0x15, 0xb4, 0xaa, 0x5b, 0x65, 0x50, 0x16, 0xa8,
+ 0x44, 0x92, 0x77, 0xdb, 0xd4, 0x77, 0xef, 0x2c,
+ 0x8d, 0x6c, 0x01, 0x7d, 0xb7, 0x38, 0xb1, 0x8d,
+ 0xeb, 0x4a, 0x42, 0x7d, 0x19, 0x23, 0xce, 0x3f,
+ 0xf2, 0x62, 0x73, 0x57, 0x79, 0xa4, 0x18, 0xf2,
+ 0x0a, 0x28, 0x2d, 0xf9, 0x20, 0x14, 0x7b, 0xea,
+ 0xbe, 0x42, 0x1e, 0xe5, 0x31, 0x9d, 0x05, 0x68
+};
+
+static uint8_t v5_CTX[512] = {
+ 0x26, 0x4d, 0x3c, 0xa8, 0x51, 0x21, 0x94, 0xfe,
+ 0xc3, 0x12, 0xc8, 0xc9, 0x89, 0x1f, 0x27, 0x9f,
+ 0xef, 0xdd, 0x60, 0x8d, 0x0c, 0x02, 0x7b, 0x60,
+ 0x48, 0x3a, 0x3f, 0xa8, 0x11, 0xd6, 0x5e, 0xe5,
+ 0x9d, 0x52, 0xd9, 0xe4, 0x0e, 0xc5, 0x67, 0x2d,
+ 0x81, 0x53, 0x2b, 0x38, 0xb6, 0xb0, 0x89, 0xce,
+ 0x95, 0x1f, 0x0f, 0x9c, 0x35, 0x59, 0x0b, 0x8b,
+ 0x97, 0x8d, 0x17, 0x52, 0x13, 0xf3, 0x29, 0xbb,
+ 0x1c, 0x2f, 0xd3, 0x0f, 0x2f, 0x7f, 0x30, 0x49,
+ 0x2a, 0x61, 0xa5, 0x32, 0xa7, 0x9f, 0x51, 0xd3,
+ 0x6f, 0x5e, 0x31, 0xa7, 0xc9, 0xa1, 0x2c, 0x28,
+ 0x60, 0x82, 0xff, 0x7d, 0x23, 0x94, 0xd1, 0x8f,
+ 0x78, 0x3e, 0x1a, 0x8e, 0x72, 0xc7, 0x22, 0xca,
+ 0xaa, 0xa5, 0x2d, 0x8f, 0x06, 0x56, 0x57, 0xd2,
+ 0x63, 0x1f, 0xd2, 0x5b, 0xfd, 0x8e, 0x5b, 0xaa,
+ 0xd6, 0xe5, 0x27, 0xd7, 0x63, 0x51, 0x75, 0x01,
+ 0xc6, 0x8c, 0x5e, 0xdc, 0x3c, 0xdd, 0x55, 0x43,
+ 0x5c, 0x53, 0x2d, 0x71, 0x25, 0xc8, 0x61, 0x4d,
+ 0xee, 0xd9, 0xad, 0xaa, 0x3a, 0xca, 0xde, 0x58,
+ 0x88, 0xb8, 0x7b, 0xef, 0x64, 0x1c, 0x4c, 0x99,
+ 0x4c, 0x80, 0x91, 0xb5, 0xbc, 0xd3, 0x87, 0xf3,
+ 0x96, 0x3f, 0xb5, 0xbc, 0x37, 0xaa, 0x92, 0x2f,
+ 0xbf, 0xe3, 0xdf, 0x4e, 0x5b, 0x91, 0x5e, 0x6e,
+ 0xb5, 0x14, 0x71, 0x7b, 0xdd, 0x2a, 0x74, 0x07,
+ 0x9a, 0x50, 0x73, 0xf5, 0xc4, 0xbf, 0xd4, 0x6a,
+ 0xdf, 0x7d, 0x28, 0x2e, 0x7a, 0x39, 0x3a, 0x52,
+ 0x57, 0x9d, 0x11, 0xa0, 0x28, 0xda, 0x4d, 0x9c,
+ 0xd9, 0xc7, 0x71, 0x24, 0xf9, 0x64, 0x8e, 0xe3,
+ 0x83, 0xb1, 0xac, 0x76, 0x39, 0x30, 0xe7, 0x16,
+ 0x2a, 0x8d, 0x37, 0xf3, 0x50, 0xb2, 0xf7, 0x4b,
+ 0x84, 0x72, 0xcf, 0x09, 0x90, 0x20, 0x63, 0xc6,
+ 0xb3, 0x2e, 0x8c, 0x2d, 0x92, 0x90, 0xce, 0xfb,
+ 0xd7, 0x34, 0x6d, 0x1c, 0x77, 0x9a, 0x0d, 0xf5,
+ 0x0e, 0xdc, 0xde, 0x45, 0x31, 0xda, 0x07, 0xb0,
+ 0x99, 0xc6, 0x38, 0xe8, 0x3a, 0x75, 0x59, 0x44,
+ 0xdf, 0x2a, 0xef, 0x1a, 0xa3, 0x17, 0x52, 0xfd,
+ 0x32, 0x3d, 0xcb, 0x71, 0x0f, 0xb4, 0xbf, 0xbb,
+ 0x9d, 0x22, 0xb9, 0x25, 0xbc, 0x35, 0x77, 0xe1,
+ 0xb8, 0x94, 0x9e, 0x72, 0x9a, 0x90, 0xbb, 0xaf,
+ 0xea, 0xcf, 0x7f, 0x78, 0x79, 0xe7, 0xb1, 0x14,
+ 0x7e, 0x28, 0xba, 0x0b, 0xae, 0x94, 0x0d, 0xb7,
+ 0x95, 0xa6, 0x1b, 0x15, 0xec, 0xf4, 0xdf, 0x8d,
+ 0xb0, 0x7b, 0x82, 0x4b, 0xb0, 0x62, 0x80, 0x2c,
+ 0xc9, 0x8a, 0x95, 0x45, 0xbb, 0x2a, 0xae, 0xed,
+ 0x77, 0xcb, 0x3f, 0xc6, 0xdb, 0x15, 0xdc, 0xd7,
+ 0xd8, 0x0d, 0x7d, 0x5b, 0xc4, 0x06, 0xc4, 0x97,
+ 0x0a, 0x34, 0x78, 0xad, 0xa8, 0x89, 0x9b, 0x32,
+ 0x91, 0x98, 0xeb, 0x61, 0xc1, 0x93, 0xfb, 0x62,
+ 0x75, 0xaa, 0x8c, 0xa3, 0x40, 0x34, 0x4a, 0x75,
+ 0xa8, 0x62, 0xae, 0xbe, 0x92, 0xee, 0xe1, 0xce,
+ 0x03, 0x2f, 0xd9, 0x50, 0xb4, 0x7d, 0x77, 0x04,
+ 0xa3, 0x87, 0x69, 0x23, 0xb4, 0xad, 0x62, 0x84,
+ 0x4b, 0xf4, 0xa0, 0x9c, 0x4d, 0xbe, 0x8b, 0x43,
+ 0x97, 0x18, 0x4b, 0x74, 0x71, 0x36, 0x0c, 0x95,
+ 0x64, 0x88, 0x0a, 0xed, 0xdd, 0xb9, 0xba, 0xa4,
+ 0xaf, 0x2e, 0x75, 0x39, 0x4b, 0x08, 0xcd, 0x32,
+ 0xff, 0x47, 0x9c, 0x57, 0xa0, 0x7d, 0x3e, 0xab,
+ 0x5d, 0x54, 0xde, 0x5f, 0x97, 0x38, 0xb8, 0xd2,
+ 0x7f, 0x27, 0xa9, 0xf0, 0xab, 0x11, 0x79, 0x9d,
+ 0x7b, 0x7f, 0xfe, 0xfb, 0x27, 0x04, 0xc9, 0x5c,
+ 0x6a, 0xd1, 0x2c, 0x39, 0xf1, 0xe8, 0x67, 0xa4,
+ 0xb7, 0xb1, 0xd7, 0x81, 0x8a, 0x4b, 0x75, 0x3d,
+ 0xfd, 0x2a, 0x89, 0xcc, 0xb4, 0x5e, 0x00, 0x1a,
+ 0x03, 0xa8, 0x67, 0xb1, 0x87, 0xf2, 0x25, 0xdd
+};
+
+/*
+ * Vector 6
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number 02
+ * PTX 264d3ca8512194fec312c8c9891f279fefdd608d0c027b60483a3fa811d65ee5
+ * PTX 9d52d9e40ec5672d81532b38b6b089ce951f0f9c35590b8b978d175213f329bb
+ * PTX 1c2fd30f2f7f30492a61a532a79f51d36f5e31a7c9a12c286082ff7d2394d18f
+ * PTX 783e1a8e72c722caaaa52d8f065657d2631fd25bfd8e5baad6e527d763517501
+ * PTX c68c5edc3cdd55435c532d7125c8614deed9adaa3acade5888b87bef641c4c99
+ * PTX 4c8091b5bcd387f3963fb5bc37aa922fbfe3df4e5b915e6eb514717bdd2a7407
+ * PTX 9a5073f5c4bfd46adf7d282e7a393a52579d11a028da4d9cd9c77124f9648ee3
+ * PTX 83b1ac763930e7162a8d37f350b2f74b8472cf09902063c6b32e8c2d9290cefb
+ * PTX d7346d1c779a0df50edcde4531da07b099c638e83a755944df2aef1aa31752fd
+ * PTX 323dcb710fb4bfbb9d22b925bc3577e1b8949e729a90bbafeacf7f7879e7b114
+ * PTX 7e28ba0bae940db795a61b15ecf4df8db07b824bb062802cc98a9545bb2aaeed
+ * PTX 77cb3fc6db15dcd7d80d7d5bc406c4970a3478ada8899b329198eb61c193fb62
+ * PTX 75aa8ca340344a75a862aebe92eee1ce032fd950b47d7704a3876923b4ad6284
+ * PTX 4bf4a09c4dbe8b4397184b7471360c9564880aedddb9baa4af2e75394b08cd32
+ * PTX ff479c57a07d3eab5d54de5f9738b8d27f27a9f0ab11799d7b7ffefb2704c95c
+ * PTX 6ad12c39f1e867a4b7b1d7818a4b753dfd2a89ccb45e001a03a867b187f225dd
+ * CTX fa762a3680b76007928ed4a4f49a9456031b704782e65e16cecb54ed7d017b5e
+ * CTX 18abd67b338e81078f21edb7868d901ebe9c731a7c18b5e6dec1d6a72e078ac9
+ * CTX a4262f860beefa14f4e821018272e411a951502b6e79066e84252c3346f3aa62
+ * CTX 344351a291d4bedc7a07618bdea2af63145cc7a4b8d4070691ae890cd65733e7
+ * CTX 946e9021a1dffc4c59f159425ee6d50ca9b135fa6162cea18a939838dc000fb3
+ * CTX 86fad086acce5ac07cb2ece7fd580b00cfa5e98589631dc25e8e2a3daf2ffdec
+ * CTX 26531659912c9d8f7a15e5865ea8fb5816d6207052bd7128cd743c12c8118791
+ * CTX a4736811935eb982a532349e31dd401e0b660a568cb1a4711f552f55ded59f1f
+ * CTX 15bf7196b3ca12a91e488ef59d64f3a02bf45239499ac6176ae321c4a211ec54
+ * CTX 5365971c5d3f4f09d4eb139bfdf2073d33180b21002b65cc9865e76cb24cd92c
+ * CTX 874c24c18350399a936ab3637079295d76c417776b94efce3a0ef7206b151105
+ * CTX 19655c956cbd8b2489405ee2b09a6b6eebe0c53790a12a8998378b33a5b71159
+ * CTX 625f4ba49d2a2fdba59fbf0897bc7aabd8d707dc140a80f0f309f835d3da54ab
+ * CTX 584e501dfa0ee977fec543f74186a802b9a37adb3e8291eca04d66520d229e60
+ * CTX 401e7282bef486ae059aa70696e0e305d777140a7a883ecdcb69b9ff938e8a42
+ * CTX 31864c69ca2c2043bed007ff3e605e014bcf518138dc3a25c5e236171a2d01d6
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v6_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v6_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v6_TW[16] = {
+ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v6_PTX[512] = {
+
+ 0x26, 0x4d, 0x3c, 0xa8, 0x51, 0x21, 0x94, 0xfe,
+ 0xc3, 0x12, 0xc8, 0xc9, 0x89, 0x1f, 0x27, 0x9f,
+ 0xef, 0xdd, 0x60, 0x8d, 0x0c, 0x02, 0x7b, 0x60,
+ 0x48, 0x3a, 0x3f, 0xa8, 0x11, 0xd6, 0x5e, 0xe5,
+ 0x9d, 0x52, 0xd9, 0xe4, 0x0e, 0xc5, 0x67, 0x2d,
+ 0x81, 0x53, 0x2b, 0x38, 0xb6, 0xb0, 0x89, 0xce,
+ 0x95, 0x1f, 0x0f, 0x9c, 0x35, 0x59, 0x0b, 0x8b,
+ 0x97, 0x8d, 0x17, 0x52, 0x13, 0xf3, 0x29, 0xbb,
+ 0x1c, 0x2f, 0xd3, 0x0f, 0x2f, 0x7f, 0x30, 0x49,
+ 0x2a, 0x61, 0xa5, 0x32, 0xa7, 0x9f, 0x51, 0xd3,
+ 0x6f, 0x5e, 0x31, 0xa7, 0xc9, 0xa1, 0x2c, 0x28,
+ 0x60, 0x82, 0xff, 0x7d, 0x23, 0x94, 0xd1, 0x8f,
+ 0x78, 0x3e, 0x1a, 0x8e, 0x72, 0xc7, 0x22, 0xca,
+ 0xaa, 0xa5, 0x2d, 0x8f, 0x06, 0x56, 0x57, 0xd2,
+ 0x63, 0x1f, 0xd2, 0x5b, 0xfd, 0x8e, 0x5b, 0xaa,
+ 0xd6, 0xe5, 0x27, 0xd7, 0x63, 0x51, 0x75, 0x01,
+ 0xc6, 0x8c, 0x5e, 0xdc, 0x3c, 0xdd, 0x55, 0x43,
+ 0x5c, 0x53, 0x2d, 0x71, 0x25, 0xc8, 0x61, 0x4d,
+ 0xee, 0xd9, 0xad, 0xaa, 0x3a, 0xca, 0xde, 0x58,
+ 0x88, 0xb8, 0x7b, 0xef, 0x64, 0x1c, 0x4c, 0x99,
+ 0x4c, 0x80, 0x91, 0xb5, 0xbc, 0xd3, 0x87, 0xf3,
+ 0x96, 0x3f, 0xb5, 0xbc, 0x37, 0xaa, 0x92, 0x2f,
+ 0xbf, 0xe3, 0xdf, 0x4e, 0x5b, 0x91, 0x5e, 0x6e,
+ 0xb5, 0x14, 0x71, 0x7b, 0xdd, 0x2a, 0x74, 0x07,
+ 0x9a, 0x50, 0x73, 0xf5, 0xc4, 0xbf, 0xd4, 0x6a,
+ 0xdf, 0x7d, 0x28, 0x2e, 0x7a, 0x39, 0x3a, 0x52,
+ 0x57, 0x9d, 0x11, 0xa0, 0x28, 0xda, 0x4d, 0x9c,
+ 0xd9, 0xc7, 0x71, 0x24, 0xf9, 0x64, 0x8e, 0xe3,
+ 0x83, 0xb1, 0xac, 0x76, 0x39, 0x30, 0xe7, 0x16,
+ 0x2a, 0x8d, 0x37, 0xf3, 0x50, 0xb2, 0xf7, 0x4b,
+ 0x84, 0x72, 0xcf, 0x09, 0x90, 0x20, 0x63, 0xc6,
+ 0xb3, 0x2e, 0x8c, 0x2d, 0x92, 0x90, 0xce, 0xfb,
+ 0xd7, 0x34, 0x6d, 0x1c, 0x77, 0x9a, 0x0d, 0xf5,
+ 0x0e, 0xdc, 0xde, 0x45, 0x31, 0xda, 0x07, 0xb0,
+ 0x99, 0xc6, 0x38, 0xe8, 0x3a, 0x75, 0x59, 0x44,
+ 0xdf, 0x2a, 0xef, 0x1a, 0xa3, 0x17, 0x52, 0xfd,
+ 0x32, 0x3d, 0xcb, 0x71, 0x0f, 0xb4, 0xbf, 0xbb,
+ 0x9d, 0x22, 0xb9, 0x25, 0xbc, 0x35, 0x77, 0xe1,
+ 0xb8, 0x94, 0x9e, 0x72, 0x9a, 0x90, 0xbb, 0xaf,
+ 0xea, 0xcf, 0x7f, 0x78, 0x79, 0xe7, 0xb1, 0x14,
+ 0x7e, 0x28, 0xba, 0x0b, 0xae, 0x94, 0x0d, 0xb7,
+ 0x95, 0xa6, 0x1b, 0x15, 0xec, 0xf4, 0xdf, 0x8d,
+ 0xb0, 0x7b, 0x82, 0x4b, 0xb0, 0x62, 0x80, 0x2c,
+ 0xc9, 0x8a, 0x95, 0x45, 0xbb, 0x2a, 0xae, 0xed,
+ 0x77, 0xcb, 0x3f, 0xc6, 0xdb, 0x15, 0xdc, 0xd7,
+ 0xd8, 0x0d, 0x7d, 0x5b, 0xc4, 0x06, 0xc4, 0x97,
+ 0x0a, 0x34, 0x78, 0xad, 0xa8, 0x89, 0x9b, 0x32,
+ 0x91, 0x98, 0xeb, 0x61, 0xc1, 0x93, 0xfb, 0x62,
+ 0x75, 0xaa, 0x8c, 0xa3, 0x40, 0x34, 0x4a, 0x75,
+ 0xa8, 0x62, 0xae, 0xbe, 0x92, 0xee, 0xe1, 0xce,
+ 0x03, 0x2f, 0xd9, 0x50, 0xb4, 0x7d, 0x77, 0x04,
+ 0xa3, 0x87, 0x69, 0x23, 0xb4, 0xad, 0x62, 0x84,
+ 0x4b, 0xf4, 0xa0, 0x9c, 0x4d, 0xbe, 0x8b, 0x43,
+ 0x97, 0x18, 0x4b, 0x74, 0x71, 0x36, 0x0c, 0x95,
+ 0x64, 0x88, 0x0a, 0xed, 0xdd, 0xb9, 0xba, 0xa4,
+ 0xaf, 0x2e, 0x75, 0x39, 0x4b, 0x08, 0xcd, 0x32,
+ 0xff, 0x47, 0x9c, 0x57, 0xa0, 0x7d, 0x3e, 0xab,
+ 0x5d, 0x54, 0xde, 0x5f, 0x97, 0x38, 0xb8, 0xd2,
+ 0x7f, 0x27, 0xa9, 0xf0, 0xab, 0x11, 0x79, 0x9d,
+ 0x7b, 0x7f, 0xfe, 0xfb, 0x27, 0x04, 0xc9, 0x5c,
+ 0x6a, 0xd1, 0x2c, 0x39, 0xf1, 0xe8, 0x67, 0xa4,
+ 0xb7, 0xb1, 0xd7, 0x81, 0x8a, 0x4b, 0x75, 0x3d,
+ 0xfd, 0x2a, 0x89, 0xcc, 0xb4, 0x5e, 0x00, 0x1a,
+ 0x03, 0xa8, 0x67, 0xb1, 0x87, 0xf2, 0x25, 0xdd
+};
+
+static uint8_t v6_CTX[512] = {
+
+ 0xfa, 0x76, 0x2a, 0x36, 0x80, 0xb7, 0x60, 0x07,
+ 0x92, 0x8e, 0xd4, 0xa4, 0xf4, 0x9a, 0x94, 0x56,
+ 0x03, 0x1b, 0x70, 0x47, 0x82, 0xe6, 0x5e, 0x16,
+ 0xce, 0xcb, 0x54, 0xed, 0x7d, 0x01, 0x7b, 0x5e,
+ 0x18, 0xab, 0xd6, 0x7b, 0x33, 0x8e, 0x81, 0x07,
+ 0x8f, 0x21, 0xed, 0xb7, 0x86, 0x8d, 0x90, 0x1e,
+ 0xbe, 0x9c, 0x73, 0x1a, 0x7c, 0x18, 0xb5, 0xe6,
+ 0xde, 0xc1, 0xd6, 0xa7, 0x2e, 0x07, 0x8a, 0xc9,
+ 0xa4, 0x26, 0x2f, 0x86, 0x0b, 0xee, 0xfa, 0x14,
+ 0xf4, 0xe8, 0x21, 0x01, 0x82, 0x72, 0xe4, 0x11,
+ 0xa9, 0x51, 0x50, 0x2b, 0x6e, 0x79, 0x06, 0x6e,
+ 0x84, 0x25, 0x2c, 0x33, 0x46, 0xf3, 0xaa, 0x62,
+ 0x34, 0x43, 0x51, 0xa2, 0x91, 0xd4, 0xbe, 0xdc,
+ 0x7a, 0x07, 0x61, 0x8b, 0xde, 0xa2, 0xaf, 0x63,
+ 0x14, 0x5c, 0xc7, 0xa4, 0xb8, 0xd4, 0x07, 0x06,
+ 0x91, 0xae, 0x89, 0x0c, 0xd6, 0x57, 0x33, 0xe7,
+ 0x94, 0x6e, 0x90, 0x21, 0xa1, 0xdf, 0xfc, 0x4c,
+ 0x59, 0xf1, 0x59, 0x42, 0x5e, 0xe6, 0xd5, 0x0c,
+ 0xa9, 0xb1, 0x35, 0xfa, 0x61, 0x62, 0xce, 0xa1,
+ 0x8a, 0x93, 0x98, 0x38, 0xdc, 0x00, 0x0f, 0xb3,
+ 0x86, 0xfa, 0xd0, 0x86, 0xac, 0xce, 0x5a, 0xc0,
+ 0x7c, 0xb2, 0xec, 0xe7, 0xfd, 0x58, 0x0b, 0x00,
+ 0xcf, 0xa5, 0xe9, 0x85, 0x89, 0x63, 0x1d, 0xc2,
+ 0x5e, 0x8e, 0x2a, 0x3d, 0xaf, 0x2f, 0xfd, 0xec,
+ 0x26, 0x53, 0x16, 0x59, 0x91, 0x2c, 0x9d, 0x8f,
+ 0x7a, 0x15, 0xe5, 0x86, 0x5e, 0xa8, 0xfb, 0x58,
+ 0x16, 0xd6, 0x20, 0x70, 0x52, 0xbd, 0x71, 0x28,
+ 0xcd, 0x74, 0x3c, 0x12, 0xc8, 0x11, 0x87, 0x91,
+ 0xa4, 0x73, 0x68, 0x11, 0x93, 0x5e, 0xb9, 0x82,
+ 0xa5, 0x32, 0x34, 0x9e, 0x31, 0xdd, 0x40, 0x1e,
+ 0x0b, 0x66, 0x0a, 0x56, 0x8c, 0xb1, 0xa4, 0x71,
+ 0x1f, 0x55, 0x2f, 0x55, 0xde, 0xd5, 0x9f, 0x1f,
+ 0x15, 0xbf, 0x71, 0x96, 0xb3, 0xca, 0x12, 0xa9,
+ 0x1e, 0x48, 0x8e, 0xf5, 0x9d, 0x64, 0xf3, 0xa0,
+ 0x2b, 0xf4, 0x52, 0x39, 0x49, 0x9a, 0xc6, 0x17,
+ 0x6a, 0xe3, 0x21, 0xc4, 0xa2, 0x11, 0xec, 0x54,
+ 0x53, 0x65, 0x97, 0x1c, 0x5d, 0x3f, 0x4f, 0x09,
+ 0xd4, 0xeb, 0x13, 0x9b, 0xfd, 0xf2, 0x07, 0x3d,
+ 0x33, 0x18, 0x0b, 0x21, 0x00, 0x2b, 0x65, 0xcc,
+ 0x98, 0x65, 0xe7, 0x6c, 0xb2, 0x4c, 0xd9, 0x2c,
+ 0x87, 0x4c, 0x24, 0xc1, 0x83, 0x50, 0x39, 0x9a,
+ 0x93, 0x6a, 0xb3, 0x63, 0x70, 0x79, 0x29, 0x5d,
+ 0x76, 0xc4, 0x17, 0x77, 0x6b, 0x94, 0xef, 0xce,
+ 0x3a, 0x0e, 0xf7, 0x20, 0x6b, 0x15, 0x11, 0x05,
+ 0x19, 0x65, 0x5c, 0x95, 0x6c, 0xbd, 0x8b, 0x24,
+ 0x89, 0x40, 0x5e, 0xe2, 0xb0, 0x9a, 0x6b, 0x6e,
+ 0xeb, 0xe0, 0xc5, 0x37, 0x90, 0xa1, 0x2a, 0x89,
+ 0x98, 0x37, 0x8b, 0x33, 0xa5, 0xb7, 0x11, 0x59,
+ 0x62, 0x5f, 0x4b, 0xa4, 0x9d, 0x2a, 0x2f, 0xdb,
+ 0xa5, 0x9f, 0xbf, 0x08, 0x97, 0xbc, 0x7a, 0xab,
+ 0xd8, 0xd7, 0x07, 0xdc, 0x14, 0x0a, 0x80, 0xf0,
+ 0xf3, 0x09, 0xf8, 0x35, 0xd3, 0xda, 0x54, 0xab,
+ 0x58, 0x4e, 0x50, 0x1d, 0xfa, 0x0e, 0xe9, 0x77,
+ 0xfe, 0xc5, 0x43, 0xf7, 0x41, 0x86, 0xa8, 0x02,
+ 0xb9, 0xa3, 0x7a, 0xdb, 0x3e, 0x82, 0x91, 0xec,
+ 0xa0, 0x4d, 0x66, 0x52, 0x0d, 0x22, 0x9e, 0x60,
+ 0x40, 0x1e, 0x72, 0x82, 0xbe, 0xf4, 0x86, 0xae,
+ 0x05, 0x9a, 0xa7, 0x06, 0x96, 0xe0, 0xe3, 0x05,
+ 0xd7, 0x77, 0x14, 0x0a, 0x7a, 0x88, 0x3e, 0xcd,
+ 0xcb, 0x69, 0xb9, 0xff, 0x93, 0x8e, 0x8a, 0x42,
+ 0x31, 0x86, 0x4c, 0x69, 0xca, 0x2c, 0x20, 0x43,
+ 0xbe, 0xd0, 0x07, 0xff, 0x3e, 0x60, 0x5e, 0x01,
+ 0x4b, 0xcf, 0x51, 0x81, 0x38, 0xdc, 0x3a, 0x25,
+ 0xc5, 0xe2, 0x36, 0x17, 0x1a, 0x2d, 0x01, 0xd6
+};
+
+/*
+ * Vector 7
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number fd
+ * PTX 8e41b78c390b5af9d758bb214a67e9f6bf7727b09ac6124084c37611398fa45d
+ * PTX aad94868600ed391fb1acd4857a95b466e62ef9f4b377244d1c152e7b30d731a
+ * PTX ad30c716d214b707aed99eb5b5e580b3e887cf7497465651d4b60e6042051da3
+ * PTX 693c3b78c14489543be8b6ad0ba629565bba202313ba7b0d0c94a3252b676f46
+ * PTX cc02ce0f8a7d34c0ed229129673c1f61aed579d08a9203a25aac3a77e9db6026
+ * PTX 7996db38df637356d9dcd1632e369939f2a29d89345c66e05066f1a3677aef18
+ * PTX dea4113faeb629e46721a66d0a7e785d3e29af2594eb67dfa982affe0aac058f
+ * PTX 6e15864269b135418261fc3afb089472cf68c45dd7f231c6249ba0255e1e0338
+ * PTX 33fc4d00a3fe02132d7bc3873614b8aee34273581ea0325c81f0270affa13641
+ * PTX d052d36f0757d484014354d02d6883ca15c24d8c3956b1bd027bcf41f151fd80
+ * PTX 23c5340e5606f37e90fdb87c86fb4fa634b3718a30bace06a66eaf8f63c4aa3b
+ * PTX 637826a87fe8cfa44282e92cb1615af3a28e53bc74c7cba1a0977be9065d0c1a
+ * PTX 5dec6c54ae38d37f37aa35283e048e5530a85c4e7a29d7b92ec0c3169cdf2a80
+ * PTX 5c7604bce60049b9fb7b8eaac10f51ae23794ceba68bb58112e293b9b692ca72
+ * PTX 1b37c662f8574ed4dba6f88e170881c82cddc1034a0ca7e284bf0962b6b26292
+ * PTX d836fa9f73c1ac770eef0f2d3a1eaf61d3e03555fd424eedd67e18a18094f888
+ * CTX d55f684f81f4426e9fde92a5ff02df2ac896af63962888a97910c1379e20b0a3
+ * CTX b1db613fb7fe2e07004329ea5c22bfd33e3dbe4cf58cc608c2c26c19a2e2fe22
+ * CTX f98732c2b5cb844cc6c0702d91e1d50fc4382a7eba5635cd602432a2306ac4ce
+ * CTX 82f8d70c8d9bc15f918fe71e74c622d5cf71178bf6e0b9cc9f2b41dd8dbe441c
+ * CTX 41cd0c73a6dc47a348f6702f9d0e9b1b1431e948e299b9ec2272ab2c5f0c7be8
+ * CTX 6affa5dec87a0bee81d3d50007edaa2bcfccb35605155ff36ed8edd4a40dcd4b
+ * CTX 243acd11b2b987bdbfaf91a7cac27e9c5aea525ee53de7b2d3332c8644402b82
+ * CTX 3e94a7db26276d2d23aa07180f76b4fd29b9c0823099c9d62c519880aee7e969
+ * CTX 7617c1497d47bf3e571950311421b6b734d38b0db91eb85331b91ea9f61530f5
+ * CTX 4512a5a52a4bad589eb69781d537f23297bb459bdad2948a29e1550bf4787e0b
+ * CTX e95bb173cf5fab17dab7a13a052a63453d97ccec1a321954886b7a1299faaeec
+ * CTX ae35c6eaaca753b041b5e5f093bf83397fd21dd6b3012066fcc058cc32c3b09d
+ * CTX 7562dee29509b5839392c9ff05f51f3166aaac4ac5f238038a3045e6f72e48ef
+ * CTX 0fe8bc675e82c318a268e43970271bf119b81bf6a982746554f84e72b9f00280
+ * CTX a320a08142923c23c883423ff949827f29bbacdc1ccdb04938ce6098c95ba6b3
+ * CTX 2528f4ef78eed778b2e122ddfd1cbdd11d1c0a6783e011fc536d63d053260637
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v7_key1[16] = {
+
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v7_key2[16] = {
+
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v7_TW[16] = {
+
+ 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v7_PTX[512] = {
+
+ 0x8e, 0x41, 0xb7, 0x8c, 0x39, 0x0b, 0x5a, 0xf9,
+ 0xd7, 0x58, 0xbb, 0x21, 0x4a, 0x67, 0xe9, 0xf6,
+ 0xbf, 0x77, 0x27, 0xb0, 0x9a, 0xc6, 0x12, 0x40,
+ 0x84, 0xc3, 0x76, 0x11, 0x39, 0x8f, 0xa4, 0x5d,
+ 0xaa, 0xd9, 0x48, 0x68, 0x60, 0x0e, 0xd3, 0x91,
+ 0xfb, 0x1a, 0xcd, 0x48, 0x57, 0xa9, 0x5b, 0x46,
+ 0x6e, 0x62, 0xef, 0x9f, 0x4b, 0x37, 0x72, 0x44,
+ 0xd1, 0xc1, 0x52, 0xe7, 0xb3, 0x0d, 0x73, 0x1a,
+ 0xad, 0x30, 0xc7, 0x16, 0xd2, 0x14, 0xb7, 0x07,
+ 0xae, 0xd9, 0x9e, 0xb5, 0xb5, 0xe5, 0x80, 0xb3,
+ 0xe8, 0x87, 0xcf, 0x74, 0x97, 0x46, 0x56, 0x51,
+ 0xd4, 0xb6, 0x0e, 0x60, 0x42, 0x05, 0x1d, 0xa3,
+ 0x69, 0x3c, 0x3b, 0x78, 0xc1, 0x44, 0x89, 0x54,
+ 0x3b, 0xe8, 0xb6, 0xad, 0x0b, 0xa6, 0x29, 0x56,
+ 0x5b, 0xba, 0x20, 0x23, 0x13, 0xba, 0x7b, 0x0d,
+ 0x0c, 0x94, 0xa3, 0x25, 0x2b, 0x67, 0x6f, 0x46,
+ 0xcc, 0x02, 0xce, 0x0f, 0x8a, 0x7d, 0x34, 0xc0,
+ 0xed, 0x22, 0x91, 0x29, 0x67, 0x3c, 0x1f, 0x61,
+ 0xae, 0xd5, 0x79, 0xd0, 0x8a, 0x92, 0x03, 0xa2,
+ 0x5a, 0xac, 0x3a, 0x77, 0xe9, 0xdb, 0x60, 0x26,
+ 0x79, 0x96, 0xdb, 0x38, 0xdf, 0x63, 0x73, 0x56,
+ 0xd9, 0xdc, 0xd1, 0x63, 0x2e, 0x36, 0x99, 0x39,
+ 0xf2, 0xa2, 0x9d, 0x89, 0x34, 0x5c, 0x66, 0xe0,
+ 0x50, 0x66, 0xf1, 0xa3, 0x67, 0x7a, 0xef, 0x18,
+ 0xde, 0xa4, 0x11, 0x3f, 0xae, 0xb6, 0x29, 0xe4,
+ 0x67, 0x21, 0xa6, 0x6d, 0x0a, 0x7e, 0x78, 0x5d,
+ 0x3e, 0x29, 0xaf, 0x25, 0x94, 0xeb, 0x67, 0xdf,
+ 0xa9, 0x82, 0xaf, 0xfe, 0x0a, 0xac, 0x05, 0x8f,
+ 0x6e, 0x15, 0x86, 0x42, 0x69, 0xb1, 0x35, 0x41,
+ 0x82, 0x61, 0xfc, 0x3a, 0xfb, 0x08, 0x94, 0x72,
+ 0xcf, 0x68, 0xc4, 0x5d, 0xd7, 0xf2, 0x31, 0xc6,
+ 0x24, 0x9b, 0xa0, 0x25, 0x5e, 0x1e, 0x03, 0x38,
+ 0x33, 0xfc, 0x4d, 0x00, 0xa3, 0xfe, 0x02, 0x13,
+ 0x2d, 0x7b, 0xc3, 0x87, 0x36, 0x14, 0xb8, 0xae,
+ 0xe3, 0x42, 0x73, 0x58, 0x1e, 0xa0, 0x32, 0x5c,
+ 0x81, 0xf0, 0x27, 0x0a, 0xff, 0xa1, 0x36, 0x41,
+ 0xd0, 0x52, 0xd3, 0x6f, 0x07, 0x57, 0xd4, 0x84,
+ 0x01, 0x43, 0x54, 0xd0, 0x2d, 0x68, 0x83, 0xca,
+ 0x15, 0xc2, 0x4d, 0x8c, 0x39, 0x56, 0xb1, 0xbd,
+ 0x02, 0x7b, 0xcf, 0x41, 0xf1, 0x51, 0xfd, 0x80,
+ 0x23, 0xc5, 0x34, 0x0e, 0x56, 0x06, 0xf3, 0x7e,
+ 0x90, 0xfd, 0xb8, 0x7c, 0x86, 0xfb, 0x4f, 0xa6,
+ 0x34, 0xb3, 0x71, 0x8a, 0x30, 0xba, 0xce, 0x06,
+ 0xa6, 0x6e, 0xaf, 0x8f, 0x63, 0xc4, 0xaa, 0x3b,
+ 0x63, 0x78, 0x26, 0xa8, 0x7f, 0xe8, 0xcf, 0xa4,
+ 0x42, 0x82, 0xe9, 0x2c, 0xb1, 0x61, 0x5a, 0xf3,
+ 0xa2, 0x8e, 0x53, 0xbc, 0x74, 0xc7, 0xcb, 0xa1,
+ 0xa0, 0x97, 0x7b, 0xe9, 0x06, 0x5d, 0x0c, 0x1a,
+ 0x5d, 0xec, 0x6c, 0x54, 0xae, 0x38, 0xd3, 0x7f,
+ 0x37, 0xaa, 0x35, 0x28, 0x3e, 0x04, 0x8e, 0x55,
+ 0x30, 0xa8, 0x5c, 0x4e, 0x7a, 0x29, 0xd7, 0xb9,
+ 0x2e, 0xc0, 0xc3, 0x16, 0x9c, 0xdf, 0x2a, 0x80,
+ 0x5c, 0x76, 0x04, 0xbc, 0xe6, 0x00, 0x49, 0xb9,
+ 0xfb, 0x7b, 0x8e, 0xaa, 0xc1, 0x0f, 0x51, 0xae,
+ 0x23, 0x79, 0x4c, 0xeb, 0xa6, 0x8b, 0xb5, 0x81,
+ 0x12, 0xe2, 0x93, 0xb9, 0xb6, 0x92, 0xca, 0x72,
+ 0x1b, 0x37, 0xc6, 0x62, 0xf8, 0x57, 0x4e, 0xd4,
+ 0xdb, 0xa6, 0xf8, 0x8e, 0x17, 0x08, 0x81, 0xc8,
+ 0x2c, 0xdd, 0xc1, 0x03, 0x4a, 0x0c, 0xa7, 0xe2,
+ 0x84, 0xbf, 0x09, 0x62, 0xb6, 0xb2, 0x62, 0x92,
+ 0xd8, 0x36, 0xfa, 0x9f, 0x73, 0xc1, 0xac, 0x77,
+ 0x0e, 0xef, 0x0f, 0x2d, 0x3a, 0x1e, 0xaf, 0x61,
+ 0xd3, 0xe0, 0x35, 0x55, 0xfd, 0x42, 0x4e, 0xed,
+ 0xd6, 0x7e, 0x18, 0xa1, 0x80, 0x94, 0xf8, 0x88
+};
+
+static uint8_t v7_CTX[512] = {
+
+ 0xd5, 0x5f, 0x68, 0x4f, 0x81, 0xf4, 0x42, 0x6e,
+ 0x9f, 0xde, 0x92, 0xa5, 0xff, 0x02, 0xdf, 0x2a,
+ 0xc8, 0x96, 0xaf, 0x63, 0x96, 0x28, 0x88, 0xa9,
+ 0x79, 0x10, 0xc1, 0x37, 0x9e, 0x20, 0xb0, 0xa3,
+ 0xb1, 0xdb, 0x61, 0x3f, 0xb7, 0xfe, 0x2e, 0x07,
+ 0x00, 0x43, 0x29, 0xea, 0x5c, 0x22, 0xbf, 0xd3,
+ 0x3e, 0x3d, 0xbe, 0x4c, 0xf5, 0x8c, 0xc6, 0x08,
+ 0xc2, 0xc2, 0x6c, 0x19, 0xa2, 0xe2, 0xfe, 0x22,
+ 0xf9, 0x87, 0x32, 0xc2, 0xb5, 0xcb, 0x84, 0x4c,
+ 0xc6, 0xc0, 0x70, 0x2d, 0x91, 0xe1, 0xd5, 0x0f,
+ 0xc4, 0x38, 0x2a, 0x7e, 0xba, 0x56, 0x35, 0xcd,
+ 0x60, 0x24, 0x32, 0xa2, 0x30, 0x6a, 0xc4, 0xce,
+ 0x82, 0xf8, 0xd7, 0x0c, 0x8d, 0x9b, 0xc1, 0x5f,
+ 0x91, 0x8f, 0xe7, 0x1e, 0x74, 0xc6, 0x22, 0xd5,
+ 0xcf, 0x71, 0x17, 0x8b, 0xf6, 0xe0, 0xb9, 0xcc,
+ 0x9f, 0x2b, 0x41, 0xdd, 0x8d, 0xbe, 0x44, 0x1c,
+ 0x41, 0xcd, 0x0c, 0x73, 0xa6, 0xdc, 0x47, 0xa3,
+ 0x48, 0xf6, 0x70, 0x2f, 0x9d, 0x0e, 0x9b, 0x1b,
+ 0x14, 0x31, 0xe9, 0x48, 0xe2, 0x99, 0xb9, 0xec,
+ 0x22, 0x72, 0xab, 0x2c, 0x5f, 0x0c, 0x7b, 0xe8,
+ 0x6a, 0xff, 0xa5, 0xde, 0xc8, 0x7a, 0x0b, 0xee,
+ 0x81, 0xd3, 0xd5, 0x00, 0x07, 0xed, 0xaa, 0x2b,
+ 0xcf, 0xcc, 0xb3, 0x56, 0x05, 0x15, 0x5f, 0xf3,
+ 0x6e, 0xd8, 0xed, 0xd4, 0xa4, 0x0d, 0xcd, 0x4b,
+ 0x24, 0x3a, 0xcd, 0x11, 0xb2, 0xb9, 0x87, 0xbd,
+ 0xbf, 0xaf, 0x91, 0xa7, 0xca, 0xc2, 0x7e, 0x9c,
+ 0x5a, 0xea, 0x52, 0x5e, 0xe5, 0x3d, 0xe7, 0xb2,
+ 0xd3, 0x33, 0x2c, 0x86, 0x44, 0x40, 0x2b, 0x82,
+ 0x3e, 0x94, 0xa7, 0xdb, 0x26, 0x27, 0x6d, 0x2d,
+ 0x23, 0xaa, 0x07, 0x18, 0x0f, 0x76, 0xb4, 0xfd,
+ 0x29, 0xb9, 0xc0, 0x82, 0x30, 0x99, 0xc9, 0xd6,
+ 0x2c, 0x51, 0x98, 0x80, 0xae, 0xe7, 0xe9, 0x69,
+ 0x76, 0x17, 0xc1, 0x49, 0x7d, 0x47, 0xbf, 0x3e,
+ 0x57, 0x19, 0x50, 0x31, 0x14, 0x21, 0xb6, 0xb7,
+ 0x34, 0xd3, 0x8b, 0x0d, 0xb9, 0x1e, 0xb8, 0x53,
+ 0x31, 0xb9, 0x1e, 0xa9, 0xf6, 0x15, 0x30, 0xf5,
+ 0x45, 0x12, 0xa5, 0xa5, 0x2a, 0x4b, 0xad, 0x58,
+ 0x9e, 0xb6, 0x97, 0x81, 0xd5, 0x37, 0xf2, 0x32,
+ 0x97, 0xbb, 0x45, 0x9b, 0xda, 0xd2, 0x94, 0x8a,
+ 0x29, 0xe1, 0x55, 0x0b, 0xf4, 0x78, 0x7e, 0x0b,
+ 0xe9, 0x5b, 0xb1, 0x73, 0xcf, 0x5f, 0xab, 0x17,
+ 0xda, 0xb7, 0xa1, 0x3a, 0x05, 0x2a, 0x63, 0x45,
+ 0x3d, 0x97, 0xcc, 0xec, 0x1a, 0x32, 0x19, 0x54,
+ 0x88, 0x6b, 0x7a, 0x12, 0x99, 0xfa, 0xae, 0xec,
+ 0xae, 0x35, 0xc6, 0xea, 0xac, 0xa7, 0x53, 0xb0,
+ 0x41, 0xb5, 0xe5, 0xf0, 0x93, 0xbf, 0x83, 0x39,
+ 0x7f, 0xd2, 0x1d, 0xd6, 0xb3, 0x01, 0x20, 0x66,
+ 0xfc, 0xc0, 0x58, 0xcc, 0x32, 0xc3, 0xb0, 0x9d,
+ 0x75, 0x62, 0xde, 0xe2, 0x95, 0x09, 0xb5, 0x83,
+ 0x93, 0x92, 0xc9, 0xff, 0x05, 0xf5, 0x1f, 0x31,
+ 0x66, 0xaa, 0xac, 0x4a, 0xc5, 0xf2, 0x38, 0x03,
+ 0x8a, 0x30, 0x45, 0xe6, 0xf7, 0x2e, 0x48, 0xef,
+ 0x0f, 0xe8, 0xbc, 0x67, 0x5e, 0x82, 0xc3, 0x18,
+ 0xa2, 0x68, 0xe4, 0x39, 0x70, 0x27, 0x1b, 0xf1,
+ 0x19, 0xb8, 0x1b, 0xf6, 0xa9, 0x82, 0x74, 0x65,
+ 0x54, 0xf8, 0x4e, 0x72, 0xb9, 0xf0, 0x02, 0x80,
+ 0xa3, 0x20, 0xa0, 0x81, 0x42, 0x92, 0x3c, 0x23,
+ 0xc8, 0x83, 0x42, 0x3f, 0xf9, 0x49, 0x82, 0x7f,
+ 0x29, 0xbb, 0xac, 0xdc, 0x1c, 0xcd, 0xb0, 0x49,
+ 0x38, 0xce, 0x60, 0x98, 0xc9, 0x5b, 0xa6, 0xb3,
+ 0x25, 0x28, 0xf4, 0xef, 0x78, 0xee, 0xd7, 0x78,
+ 0xb2, 0xe1, 0x22, 0xdd, 0xfd, 0x1c, 0xbd, 0xd1,
+ 0x1d, 0x1c, 0x0a, 0x67, 0x83, 0xe0, 0x11, 0xfc,
+ 0x53, 0x6d, 0x63, 0xd0, 0x53, 0x26, 0x06, 0x37
+};
+
+/*
+ * Vector 8
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number fe
+ * PTX d55f684f81f4426e9fde92a5ff02df2ac896af63962888a97910c1379e20b0a3
+ * PTX b1db613fb7fe2e07004329ea5c22bfd33e3dbe4cf58cc608c2c26c19a2e2fe22
+ * PTX f98732c2b5cb844cc6c0702d91e1d50fc4382a7eba5635cd602432a2306ac4ce
+ * PTX 82f8d70c8d9bc15f918fe71e74c622d5cf71178bf6e0b9cc9f2b41dd8dbe441c
+ * PTX 41cd0c73a6dc47a348f6702f9d0e9b1b1431e948e299b9ec2272ab2c5f0c7be8
+ * PTX 6affa5dec87a0bee81d3d50007edaa2bcfccb35605155ff36ed8edd4a40dcd4b
+ * PTX 243acd11b2b987bdbfaf91a7cac27e9c5aea525ee53de7b2d3332c8644402b82
+ * PTX 3e94a7db26276d2d23aa07180f76b4fd29b9c0823099c9d62c519880aee7e969
+ * PTX 7617c1497d47bf3e571950311421b6b734d38b0db91eb85331b91ea9f61530f5
+ * PTX 4512a5a52a4bad589eb69781d537f23297bb459bdad2948a29e1550bf4787e0b
+ * PTX e95bb173cf5fab17dab7a13a052a63453d97ccec1a321954886b7a1299faaeec
+ * PTX ae35c6eaaca753b041b5e5f093bf83397fd21dd6b3012066fcc058cc32c3b09d
+ * PTX 7562dee29509b5839392c9ff05f51f3166aaac4ac5f238038a3045e6f72e48ef
+ * PTX 0fe8bc675e82c318a268e43970271bf119b81bf6a982746554f84e72b9f00280
+ * PTX a320a08142923c23c883423ff949827f29bbacdc1ccdb04938ce6098c95ba6b3
+ * PTX 2528f4ef78eed778b2e122ddfd1cbdd11d1c0a6783e011fc536d63d053260637
+ * CTX 72efc1ebfe1ee25975a6eb3aa8589dda2b261f1c85bdab442a9e5b2dd1d7c395
+ * CTX 7a16fc08e526d4b1223f1b1232a11af274c3d70dac57f83e0983c498f1a6f1ae
+ * CTX cb021c3e70085a1e527f1ce41ee5911a82020161529cd82773762daf5459de94
+ * CTX a0a82adae7e1703c808543c29ed6fb32d9e004327c1355180c995a07741493a0
+ * CTX 9c21ba01a387882da4f62534b87bb15d60d197201c0fd3bf30c1500a3ecfecdd
+ * CTX 66d8721f90bcc4c17ee925c61b0a03727a9c0d5f5ca462fbfa0af1c2513a9d9d
+ * CTX 4b5345bd27a5f6e653f751693e6b6a2b8ead57d511e00e58c45b7b8d005af792
+ * CTX 88f5c7c22fd4f1bf7a898b03a5634c6a1ae3f9fae5de4f296a2896b23e7ed43e
+ * CTX d14fa5a2803f4d28f0d3ffcf24757677aebdb47bb388378708948a8d4126ed18
+ * CTX 39e0da29a537a8c198b3c66ab00712dd261674bf45a73d67f76914f830ca014b
+ * CTX 65596f27e4cf62de66125a5566df9975155628b400fbfb3a29040ed50faffdbb
+ * CTX 18aece7c5c44693260aab386c0a37b11b114f1c415aebb653be468179428d43a
+ * CTX 4d8bc3ec38813eca30a13cf1bb18d524f1992d44d8b1a42ea30b22e6c95b199d
+ * CTX 8d182f8840b09d059585c31ad691fa0619ff038aca2c39a943421157361717c4
+ * CTX 9d322028a74648113bd8c9d7ec77cf3c89c1ec8718ceff8516d96b34c3c614f1
+ * CTX 0699c9abc4ed0411506223bea16af35c883accdbe1104eef0cfdb54e12fb230a
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v8_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v8_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v8_TW[16] = {
+ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v8_PTX[512] = {
+ 0xd5, 0x5f, 0x68, 0x4f, 0x81, 0xf4, 0x42, 0x6e,
+ 0x9f, 0xde, 0x92, 0xa5, 0xff, 0x02, 0xdf, 0x2a,
+ 0xc8, 0x96, 0xaf, 0x63, 0x96, 0x28, 0x88, 0xa9,
+ 0x79, 0x10, 0xc1, 0x37, 0x9e, 0x20, 0xb0, 0xa3,
+ 0xb1, 0xdb, 0x61, 0x3f, 0xb7, 0xfe, 0x2e, 0x07,
+ 0x00, 0x43, 0x29, 0xea, 0x5c, 0x22, 0xbf, 0xd3,
+ 0x3e, 0x3d, 0xbe, 0x4c, 0xf5, 0x8c, 0xc6, 0x08,
+ 0xc2, 0xc2, 0x6c, 0x19, 0xa2, 0xe2, 0xfe, 0x22,
+ 0xf9, 0x87, 0x32, 0xc2, 0xb5, 0xcb, 0x84, 0x4c,
+ 0xc6, 0xc0, 0x70, 0x2d, 0x91, 0xe1, 0xd5, 0x0f,
+ 0xc4, 0x38, 0x2a, 0x7e, 0xba, 0x56, 0x35, 0xcd,
+ 0x60, 0x24, 0x32, 0xa2, 0x30, 0x6a, 0xc4, 0xce,
+ 0x82, 0xf8, 0xd7, 0x0c, 0x8d, 0x9b, 0xc1, 0x5f,
+ 0x91, 0x8f, 0xe7, 0x1e, 0x74, 0xc6, 0x22, 0xd5,
+ 0xcf, 0x71, 0x17, 0x8b, 0xf6, 0xe0, 0xb9, 0xcc,
+ 0x9f, 0x2b, 0x41, 0xdd, 0x8d, 0xbe, 0x44, 0x1c,
+ 0x41, 0xcd, 0x0c, 0x73, 0xa6, 0xdc, 0x47, 0xa3,
+ 0x48, 0xf6, 0x70, 0x2f, 0x9d, 0x0e, 0x9b, 0x1b,
+ 0x14, 0x31, 0xe9, 0x48, 0xe2, 0x99, 0xb9, 0xec,
+ 0x22, 0x72, 0xab, 0x2c, 0x5f, 0x0c, 0x7b, 0xe8,
+ 0x6a, 0xff, 0xa5, 0xde, 0xc8, 0x7a, 0x0b, 0xee,
+ 0x81, 0xd3, 0xd5, 0x00, 0x07, 0xed, 0xaa, 0x2b,
+ 0xcf, 0xcc, 0xb3, 0x56, 0x05, 0x15, 0x5f, 0xf3,
+ 0x6e, 0xd8, 0xed, 0xd4, 0xa4, 0x0d, 0xcd, 0x4b,
+ 0x24, 0x3a, 0xcd, 0x11, 0xb2, 0xb9, 0x87, 0xbd,
+ 0xbf, 0xaf, 0x91, 0xa7, 0xca, 0xc2, 0x7e, 0x9c,
+ 0x5a, 0xea, 0x52, 0x5e, 0xe5, 0x3d, 0xe7, 0xb2,
+ 0xd3, 0x33, 0x2c, 0x86, 0x44, 0x40, 0x2b, 0x82,
+ 0x3e, 0x94, 0xa7, 0xdb, 0x26, 0x27, 0x6d, 0x2d,
+ 0x23, 0xaa, 0x07, 0x18, 0x0f, 0x76, 0xb4, 0xfd,
+ 0x29, 0xb9, 0xc0, 0x82, 0x30, 0x99, 0xc9, 0xd6,
+ 0x2c, 0x51, 0x98, 0x80, 0xae, 0xe7, 0xe9, 0x69,
+ 0x76, 0x17, 0xc1, 0x49, 0x7d, 0x47, 0xbf, 0x3e,
+ 0x57, 0x19, 0x50, 0x31, 0x14, 0x21, 0xb6, 0xb7,
+ 0x34, 0xd3, 0x8b, 0x0d, 0xb9, 0x1e, 0xb8, 0x53,
+ 0x31, 0xb9, 0x1e, 0xa9, 0xf6, 0x15, 0x30, 0xf5,
+ 0x45, 0x12, 0xa5, 0xa5, 0x2a, 0x4b, 0xad, 0x58,
+ 0x9e, 0xb6, 0x97, 0x81, 0xd5, 0x37, 0xf2, 0x32,
+ 0x97, 0xbb, 0x45, 0x9b, 0xda, 0xd2, 0x94, 0x8a,
+ 0x29, 0xe1, 0x55, 0x0b, 0xf4, 0x78, 0x7e, 0x0b,
+ 0xe9, 0x5b, 0xb1, 0x73, 0xcf, 0x5f, 0xab, 0x17,
+ 0xda, 0xb7, 0xa1, 0x3a, 0x05, 0x2a, 0x63, 0x45,
+ 0x3d, 0x97, 0xcc, 0xec, 0x1a, 0x32, 0x19, 0x54,
+ 0x88, 0x6b, 0x7a, 0x12, 0x99, 0xfa, 0xae, 0xec,
+ 0xae, 0x35, 0xc6, 0xea, 0xac, 0xa7, 0x53, 0xb0,
+ 0x41, 0xb5, 0xe5, 0xf0, 0x93, 0xbf, 0x83, 0x39,
+ 0x7f, 0xd2, 0x1d, 0xd6, 0xb3, 0x01, 0x20, 0x66,
+ 0xfc, 0xc0, 0x58, 0xcc, 0x32, 0xc3, 0xb0, 0x9d,
+ 0x75, 0x62, 0xde, 0xe2, 0x95, 0x09, 0xb5, 0x83,
+ 0x93, 0x92, 0xc9, 0xff, 0x05, 0xf5, 0x1f, 0x31,
+ 0x66, 0xaa, 0xac, 0x4a, 0xc5, 0xf2, 0x38, 0x03,
+ 0x8a, 0x30, 0x45, 0xe6, 0xf7, 0x2e, 0x48, 0xef,
+ 0x0f, 0xe8, 0xbc, 0x67, 0x5e, 0x82, 0xc3, 0x18,
+ 0xa2, 0x68, 0xe4, 0x39, 0x70, 0x27, 0x1b, 0xf1,
+ 0x19, 0xb8, 0x1b, 0xf6, 0xa9, 0x82, 0x74, 0x65,
+ 0x54, 0xf8, 0x4e, 0x72, 0xb9, 0xf0, 0x02, 0x80,
+ 0xa3, 0x20, 0xa0, 0x81, 0x42, 0x92, 0x3c, 0x23,
+ 0xc8, 0x83, 0x42, 0x3f, 0xf9, 0x49, 0x82, 0x7f,
+ 0x29, 0xbb, 0xac, 0xdc, 0x1c, 0xcd, 0xb0, 0x49,
+ 0x38, 0xce, 0x60, 0x98, 0xc9, 0x5b, 0xa6, 0xb3,
+ 0x25, 0x28, 0xf4, 0xef, 0x78, 0xee, 0xd7, 0x78,
+ 0xb2, 0xe1, 0x22, 0xdd, 0xfd, 0x1c, 0xbd, 0xd1,
+ 0x1d, 0x1c, 0x0a, 0x67, 0x83, 0xe0, 0x11, 0xfc,
+ 0x53, 0x6d, 0x63, 0xd0, 0x53, 0x26, 0x06, 0x37
+};
+
+static uint8_t v8_CTX[512] = {
+ 0x72, 0xef, 0xc1, 0xeb, 0xfe, 0x1e, 0xe2, 0x59,
+ 0x75, 0xa6, 0xeb, 0x3a, 0xa8, 0x58, 0x9d, 0xda,
+ 0x2b, 0x26, 0x1f, 0x1c, 0x85, 0xbd, 0xab, 0x44,
+ 0x2a, 0x9e, 0x5b, 0x2d, 0xd1, 0xd7, 0xc3, 0x95,
+ 0x7a, 0x16, 0xfc, 0x08, 0xe5, 0x26, 0xd4, 0xb1,
+ 0x22, 0x3f, 0x1b, 0x12, 0x32, 0xa1, 0x1a, 0xf2,
+ 0x74, 0xc3, 0xd7, 0x0d, 0xac, 0x57, 0xf8, 0x3e,
+ 0x09, 0x83, 0xc4, 0x98, 0xf1, 0xa6, 0xf1, 0xae,
+ 0xcb, 0x02, 0x1c, 0x3e, 0x70, 0x08, 0x5a, 0x1e,
+ 0x52, 0x7f, 0x1c, 0xe4, 0x1e, 0xe5, 0x91, 0x1a,
+ 0x82, 0x02, 0x01, 0x61, 0x52, 0x9c, 0xd8, 0x27,
+ 0x73, 0x76, 0x2d, 0xaf, 0x54, 0x59, 0xde, 0x94,
+ 0xa0, 0xa8, 0x2a, 0xda, 0xe7, 0xe1, 0x70, 0x3c,
+ 0x80, 0x85, 0x43, 0xc2, 0x9e, 0xd6, 0xfb, 0x32,
+ 0xd9, 0xe0, 0x04, 0x32, 0x7c, 0x13, 0x55, 0x18,
+ 0x0c, 0x99, 0x5a, 0x07, 0x74, 0x14, 0x93, 0xa0,
+ 0x9c, 0x21, 0xba, 0x01, 0xa3, 0x87, 0x88, 0x2d,
+ 0xa4, 0xf6, 0x25, 0x34, 0xb8, 0x7b, 0xb1, 0x5d,
+ 0x60, 0xd1, 0x97, 0x20, 0x1c, 0x0f, 0xd3, 0xbf,
+ 0x30, 0xc1, 0x50, 0x0a, 0x3e, 0xcf, 0xec, 0xdd,
+ 0x66, 0xd8, 0x72, 0x1f, 0x90, 0xbc, 0xc4, 0xc1,
+ 0x7e, 0xe9, 0x25, 0xc6, 0x1b, 0x0a, 0x03, 0x72,
+ 0x7a, 0x9c, 0x0d, 0x5f, 0x5c, 0xa4, 0x62, 0xfb,
+ 0xfa, 0x0a, 0xf1, 0xc2, 0x51, 0x3a, 0x9d, 0x9d,
+ 0x4b, 0x53, 0x45, 0xbd, 0x27, 0xa5, 0xf6, 0xe6,
+ 0x53, 0xf7, 0x51, 0x69, 0x3e, 0x6b, 0x6a, 0x2b,
+ 0x8e, 0xad, 0x57, 0xd5, 0x11, 0xe0, 0x0e, 0x58,
+ 0xc4, 0x5b, 0x7b, 0x8d, 0x00, 0x5a, 0xf7, 0x92,
+ 0x88, 0xf5, 0xc7, 0xc2, 0x2f, 0xd4, 0xf1, 0xbf,
+ 0x7a, 0x89, 0x8b, 0x03, 0xa5, 0x63, 0x4c, 0x6a,
+ 0x1a, 0xe3, 0xf9, 0xfa, 0xe5, 0xde, 0x4f, 0x29,
+ 0x6a, 0x28, 0x96, 0xb2, 0x3e, 0x7e, 0xd4, 0x3e,
+ 0xd1, 0x4f, 0xa5, 0xa2, 0x80, 0x3f, 0x4d, 0x28,
+ 0xf0, 0xd3, 0xff, 0xcf, 0x24, 0x75, 0x76, 0x77,
+ 0xae, 0xbd, 0xb4, 0x7b, 0xb3, 0x88, 0x37, 0x87,
+ 0x08, 0x94, 0x8a, 0x8d, 0x41, 0x26, 0xed, 0x18,
+ 0x39, 0xe0, 0xda, 0x29, 0xa5, 0x37, 0xa8, 0xc1,
+ 0x98, 0xb3, 0xc6, 0x6a, 0xb0, 0x07, 0x12, 0xdd,
+ 0x26, 0x16, 0x74, 0xbf, 0x45, 0xa7, 0x3d, 0x67,
+ 0xf7, 0x69, 0x14, 0xf8, 0x30, 0xca, 0x01, 0x4b,
+ 0x65, 0x59, 0x6f, 0x27, 0xe4, 0xcf, 0x62, 0xde,
+ 0x66, 0x12, 0x5a, 0x55, 0x66, 0xdf, 0x99, 0x75,
+ 0x15, 0x56, 0x28, 0xb4, 0x00, 0xfb, 0xfb, 0x3a,
+ 0x29, 0x04, 0x0e, 0xd5, 0x0f, 0xaf, 0xfd, 0xbb,
+ 0x18, 0xae, 0xce, 0x7c, 0x5c, 0x44, 0x69, 0x32,
+ 0x60, 0xaa, 0xb3, 0x86, 0xc0, 0xa3, 0x7b, 0x11,
+ 0xb1, 0x14, 0xf1, 0xc4, 0x15, 0xae, 0xbb, 0x65,
+ 0x3b, 0xe4, 0x68, 0x17, 0x94, 0x28, 0xd4, 0x3a,
+ 0x4d, 0x8b, 0xc3, 0xec, 0x38, 0x81, 0x3e, 0xca,
+ 0x30, 0xa1, 0x3c, 0xf1, 0xbb, 0x18, 0xd5, 0x24,
+ 0xf1, 0x99, 0x2d, 0x44, 0xd8, 0xb1, 0xa4, 0x2e,
+ 0xa3, 0x0b, 0x22, 0xe6, 0xc9, 0x5b, 0x19, 0x9d,
+ 0x8d, 0x18, 0x2f, 0x88, 0x40, 0xb0, 0x9d, 0x05,
+ 0x95, 0x85, 0xc3, 0x1a, 0xd6, 0x91, 0xfa, 0x06,
+ 0x19, 0xff, 0x03, 0x8a, 0xca, 0x2c, 0x39, 0xa9,
+ 0x43, 0x42, 0x11, 0x57, 0x36, 0x17, 0x17, 0xc4,
+ 0x9d, 0x32, 0x20, 0x28, 0xa7, 0x46, 0x48, 0x11,
+ 0x3b, 0xd8, 0xc9, 0xd7, 0xec, 0x77, 0xcf, 0x3c,
+ 0x89, 0xc1, 0xec, 0x87, 0x18, 0xce, 0xff, 0x85,
+ 0x16, 0xd9, 0x6b, 0x34, 0xc3, 0xc6, 0x14, 0xf1,
+ 0x06, 0x99, 0xc9, 0xab, 0xc4, 0xed, 0x04, 0x11,
+ 0x50, 0x62, 0x23, 0xbe, 0xa1, 0x6a, 0xf3, 0x5c,
+ 0x88, 0x3a, 0xcc, 0xdb, 0xe1, 0x10, 0x4e, 0xef,
+ 0x0c, 0xfd, 0xb5, 0x4e, 0x12, 0xfb, 0x23, 0x0a
+};
+
+/*
+ * Vector 9
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number ff
+ * PTX 72efc1ebfe1ee25975a6eb3aa8589dda2b261f1c85bdab442a9e5b2dd1d7c395
+ * PTX 7a16fc08e526d4b1223f1b1232a11af274c3d70dac57f83e0983c498f1a6f1ae
+ * PTX cb021c3e70085a1e527f1ce41ee5911a82020161529cd82773762daf5459de94
+ * PTX a0a82adae7e1703c808543c29ed6fb32d9e004327c1355180c995a07741493a0
+ * PTX 9c21ba01a387882da4f62534b87bb15d60d197201c0fd3bf30c1500a3ecfecdd
+ * PTX 66d8721f90bcc4c17ee925c61b0a03727a9c0d5f5ca462fbfa0af1c2513a9d9d
+ * PTX 4b5345bd27a5f6e653f751693e6b6a2b8ead57d511e00e58c45b7b8d005af792
+ * PTX 88f5c7c22fd4f1bf7a898b03a5634c6a1ae3f9fae5de4f296a2896b23e7ed43e
+ * PTX d14fa5a2803f4d28f0d3ffcf24757677aebdb47bb388378708948a8d4126ed18
+ * PTX 39e0da29a537a8c198b3c66ab00712dd261674bf45a73d67f76914f830ca014b
+ * PTX 65596f27e4cf62de66125a5566df9975155628b400fbfb3a29040ed50faffdbb
+ * PTX 18aece7c5c44693260aab386c0a37b11b114f1c415aebb653be468179428d43a
+ * PTX 4d8bc3ec38813eca30a13cf1bb18d524f1992d44d8b1a42ea30b22e6c95b199d
+ * PTX 8d182f8840b09d059585c31ad691fa0619ff038aca2c39a943421157361717c4
+ * PTX 9d322028a74648113bd8c9d7ec77cf3c89c1ec8718ceff8516d96b34c3c614f1
+ * PTX 0699c9abc4ed0411506223bea16af35c883accdbe1104eef0cfdb54e12fb230a
+ * CTX 3260ae8dad1f4a32c5cafe3ab0eb95549d461a67ceb9e5aa2d3afb62dece0553
+ * CTX 193ba50c75be251e08d1d08f1088576c7efdfaaf3f459559571e12511753b07a
+ * CTX f073f35da06af0ce0bbf6b8f5ccc5cea500ec1b211bd51f63b606bf6528796ca
+ * CTX 12173ba39b8935ee44ccce646f90a45bf9ccc567f0ace13dc2d53ebeedc81f58
+ * CTX b2e41179dddf0d5a5c42f5d8506c1a5d2f8f59f3ea873cbcd0eec19acbf32542
+ * CTX 3bd3dcb8c2b1bf1d1eaed0eba7f0698e4314fbeb2f1566d1b9253008cbccf45a
+ * CTX 2b0d9c5c9c21474f4076e02be26050b99dee4fd68a4cf890e496e4fcae7b70f9
+ * CTX 4ea5a9062da0daeba1993d2ccd1dd3c244b8428801495a58b216547e7e847c46
+ * CTX d1d756377b6242d2e5fb83bf752b54e0df71e889f3a2bb0f4c10805bf3c59037
+ * CTX 6e3c24e22ff57f7fa965577375325cea5d920db94b9c336b455f6e894c01866f
+ * CTX e9fbb8c8d3f70a2957285f6dfb5dcd8cbf54782f8fe7766d4723819913ac7734
+ * CTX 21e3a31095866bad22c86a6036b2518b2059b4229d18c8c2ccbdf906c6cc6e82
+ * CTX 464ee57bddb0bebcb1dc645325bfb3e665ef7251082c88ebb1cf203bd779fdd3
+ * CTX 8675713c8daadd17e1cabee432b09787b6ddf3304e38b731b45df5df51b78fcf
+ * CTX b3d32466028d0ba36555e7e11ab0ee0666061d1645d962444bc47a38188930a8
+ * CTX 4b4d561395c73c087021927ca638b7afc8a8679ccb84c26555440ec7f10445cd
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v9_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v9_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v9_TW[16] = {
+ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v9_PTX[512] = {
+ 0x72, 0xef, 0xc1, 0xeb, 0xfe, 0x1e, 0xe2, 0x59,
+ 0x75, 0xa6, 0xeb, 0x3a, 0xa8, 0x58, 0x9d, 0xda,
+ 0x2b, 0x26, 0x1f, 0x1c, 0x85, 0xbd, 0xab, 0x44,
+ 0x2a, 0x9e, 0x5b, 0x2d, 0xd1, 0xd7, 0xc3, 0x95,
+ 0x7a, 0x16, 0xfc, 0x08, 0xe5, 0x26, 0xd4, 0xb1,
+ 0x22, 0x3f, 0x1b, 0x12, 0x32, 0xa1, 0x1a, 0xf2,
+ 0x74, 0xc3, 0xd7, 0x0d, 0xac, 0x57, 0xf8, 0x3e,
+ 0x09, 0x83, 0xc4, 0x98, 0xf1, 0xa6, 0xf1, 0xae,
+ 0xcb, 0x02, 0x1c, 0x3e, 0x70, 0x08, 0x5a, 0x1e,
+ 0x52, 0x7f, 0x1c, 0xe4, 0x1e, 0xe5, 0x91, 0x1a,
+ 0x82, 0x02, 0x01, 0x61, 0x52, 0x9c, 0xd8, 0x27,
+ 0x73, 0x76, 0x2d, 0xaf, 0x54, 0x59, 0xde, 0x94,
+ 0xa0, 0xa8, 0x2a, 0xda, 0xe7, 0xe1, 0x70, 0x3c,
+ 0x80, 0x85, 0x43, 0xc2, 0x9e, 0xd6, 0xfb, 0x32,
+ 0xd9, 0xe0, 0x04, 0x32, 0x7c, 0x13, 0x55, 0x18,
+ 0x0c, 0x99, 0x5a, 0x07, 0x74, 0x14, 0x93, 0xa0,
+ 0x9c, 0x21, 0xba, 0x01, 0xa3, 0x87, 0x88, 0x2d,
+ 0xa4, 0xf6, 0x25, 0x34, 0xb8, 0x7b, 0xb1, 0x5d,
+ 0x60, 0xd1, 0x97, 0x20, 0x1c, 0x0f, 0xd3, 0xbf,
+ 0x30, 0xc1, 0x50, 0x0a, 0x3e, 0xcf, 0xec, 0xdd,
+ 0x66, 0xd8, 0x72, 0x1f, 0x90, 0xbc, 0xc4, 0xc1,
+ 0x7e, 0xe9, 0x25, 0xc6, 0x1b, 0x0a, 0x03, 0x72,
+ 0x7a, 0x9c, 0x0d, 0x5f, 0x5c, 0xa4, 0x62, 0xfb,
+ 0xfa, 0x0a, 0xf1, 0xc2, 0x51, 0x3a, 0x9d, 0x9d,
+ 0x4b, 0x53, 0x45, 0xbd, 0x27, 0xa5, 0xf6, 0xe6,
+ 0x53, 0xf7, 0x51, 0x69, 0x3e, 0x6b, 0x6a, 0x2b,
+ 0x8e, 0xad, 0x57, 0xd5, 0x11, 0xe0, 0x0e, 0x58,
+ 0xc4, 0x5b, 0x7b, 0x8d, 0x00, 0x5a, 0xf7, 0x92,
+ 0x88, 0xf5, 0xc7, 0xc2, 0x2f, 0xd4, 0xf1, 0xbf,
+ 0x7a, 0x89, 0x8b, 0x03, 0xa5, 0x63, 0x4c, 0x6a,
+ 0x1a, 0xe3, 0xf9, 0xfa, 0xe5, 0xde, 0x4f, 0x29,
+ 0x6a, 0x28, 0x96, 0xb2, 0x3e, 0x7e, 0xd4, 0x3e,
+ 0xd1, 0x4f, 0xa5, 0xa2, 0x80, 0x3f, 0x4d, 0x28,
+ 0xf0, 0xd3, 0xff, 0xcf, 0x24, 0x75, 0x76, 0x77,
+ 0xae, 0xbd, 0xb4, 0x7b, 0xb3, 0x88, 0x37, 0x87,
+ 0x08, 0x94, 0x8a, 0x8d, 0x41, 0x26, 0xed, 0x18,
+ 0x39, 0xe0, 0xda, 0x29, 0xa5, 0x37, 0xa8, 0xc1,
+ 0x98, 0xb3, 0xc6, 0x6a, 0xb0, 0x07, 0x12, 0xdd,
+ 0x26, 0x16, 0x74, 0xbf, 0x45, 0xa7, 0x3d, 0x67,
+ 0xf7, 0x69, 0x14, 0xf8, 0x30, 0xca, 0x01, 0x4b,
+ 0x65, 0x59, 0x6f, 0x27, 0xe4, 0xcf, 0x62, 0xde,
+ 0x66, 0x12, 0x5a, 0x55, 0x66, 0xdf, 0x99, 0x75,
+ 0x15, 0x56, 0x28, 0xb4, 0x00, 0xfb, 0xfb, 0x3a,
+ 0x29, 0x04, 0x0e, 0xd5, 0x0f, 0xaf, 0xfd, 0xbb,
+ 0x18, 0xae, 0xce, 0x7c, 0x5c, 0x44, 0x69, 0x32,
+ 0x60, 0xaa, 0xb3, 0x86, 0xc0, 0xa3, 0x7b, 0x11,
+ 0xb1, 0x14, 0xf1, 0xc4, 0x15, 0xae, 0xbb, 0x65,
+ 0x3b, 0xe4, 0x68, 0x17, 0x94, 0x28, 0xd4, 0x3a,
+ 0x4d, 0x8b, 0xc3, 0xec, 0x38, 0x81, 0x3e, 0xca,
+ 0x30, 0xa1, 0x3c, 0xf1, 0xbb, 0x18, 0xd5, 0x24,
+ 0xf1, 0x99, 0x2d, 0x44, 0xd8, 0xb1, 0xa4, 0x2e,
+ 0xa3, 0x0b, 0x22, 0xe6, 0xc9, 0x5b, 0x19, 0x9d,
+ 0x8d, 0x18, 0x2f, 0x88, 0x40, 0xb0, 0x9d, 0x05,
+ 0x95, 0x85, 0xc3, 0x1a, 0xd6, 0x91, 0xfa, 0x06,
+ 0x19, 0xff, 0x03, 0x8a, 0xca, 0x2c, 0x39, 0xa9,
+ 0x43, 0x42, 0x11, 0x57, 0x36, 0x17, 0x17, 0xc4,
+ 0x9d, 0x32, 0x20, 0x28, 0xa7, 0x46, 0x48, 0x11,
+ 0x3b, 0xd8, 0xc9, 0xd7, 0xec, 0x77, 0xcf, 0x3c,
+ 0x89, 0xc1, 0xec, 0x87, 0x18, 0xce, 0xff, 0x85,
+ 0x16, 0xd9, 0x6b, 0x34, 0xc3, 0xc6, 0x14, 0xf1,
+ 0x06, 0x99, 0xc9, 0xab, 0xc4, 0xed, 0x04, 0x11,
+ 0x50, 0x62, 0x23, 0xbe, 0xa1, 0x6a, 0xf3, 0x5c,
+ 0x88, 0x3a, 0xcc, 0xdb, 0xe1, 0x10, 0x4e, 0xef,
+ 0x0c, 0xfd, 0xb5, 0x4e, 0x12, 0xfb, 0x23, 0x0a
+};
+
+static uint8_t v9_CTX[512] = {
+ 0x32, 0x60, 0xae, 0x8d, 0xad, 0x1f, 0x4a, 0x32,
+ 0xc5, 0xca, 0xfe, 0x3a, 0xb0, 0xeb, 0x95, 0x54,
+ 0x9d, 0x46, 0x1a, 0x67, 0xce, 0xb9, 0xe5, 0xaa,
+ 0x2d, 0x3a, 0xfb, 0x62, 0xde, 0xce, 0x05, 0x53,
+ 0x19, 0x3b, 0xa5, 0x0c, 0x75, 0xbe, 0x25, 0x1e,
+ 0x08, 0xd1, 0xd0, 0x8f, 0x10, 0x88, 0x57, 0x6c,
+ 0x7e, 0xfd, 0xfa, 0xaf, 0x3f, 0x45, 0x95, 0x59,
+ 0x57, 0x1e, 0x12, 0x51, 0x17, 0x53, 0xb0, 0x7a,
+ 0xf0, 0x73, 0xf3, 0x5d, 0xa0, 0x6a, 0xf0, 0xce,
+ 0x0b, 0xbf, 0x6b, 0x8f, 0x5c, 0xcc, 0x5c, 0xea,
+ 0x50, 0x0e, 0xc1, 0xb2, 0x11, 0xbd, 0x51, 0xf6,
+ 0x3b, 0x60, 0x6b, 0xf6, 0x52, 0x87, 0x96, 0xca,
+ 0x12, 0x17, 0x3b, 0xa3, 0x9b, 0x89, 0x35, 0xee,
+ 0x44, 0xcc, 0xce, 0x64, 0x6f, 0x90, 0xa4, 0x5b,
+ 0xf9, 0xcc, 0xc5, 0x67, 0xf0, 0xac, 0xe1, 0x3d,
+ 0xc2, 0xd5, 0x3e, 0xbe, 0xed, 0xc8, 0x1f, 0x58,
+ 0xb2, 0xe4, 0x11, 0x79, 0xdd, 0xdf, 0x0d, 0x5a,
+ 0x5c, 0x42, 0xf5, 0xd8, 0x50, 0x6c, 0x1a, 0x5d,
+ 0x2f, 0x8f, 0x59, 0xf3, 0xea, 0x87, 0x3c, 0xbc,
+ 0xd0, 0xee, 0xc1, 0x9a, 0xcb, 0xf3, 0x25, 0x42,
+ 0x3b, 0xd3, 0xdc, 0xb8, 0xc2, 0xb1, 0xbf, 0x1d,
+ 0x1e, 0xae, 0xd0, 0xeb, 0xa7, 0xf0, 0x69, 0x8e,
+ 0x43, 0x14, 0xfb, 0xeb, 0x2f, 0x15, 0x66, 0xd1,
+ 0xb9, 0x25, 0x30, 0x08, 0xcb, 0xcc, 0xf4, 0x5a,
+ 0x2b, 0x0d, 0x9c, 0x5c, 0x9c, 0x21, 0x47, 0x4f,
+ 0x40, 0x76, 0xe0, 0x2b, 0xe2, 0x60, 0x50, 0xb9,
+ 0x9d, 0xee, 0x4f, 0xd6, 0x8a, 0x4c, 0xf8, 0x90,
+ 0xe4, 0x96, 0xe4, 0xfc, 0xae, 0x7b, 0x70, 0xf9,
+ 0x4e, 0xa5, 0xa9, 0x06, 0x2d, 0xa0, 0xda, 0xeb,
+ 0xa1, 0x99, 0x3d, 0x2c, 0xcd, 0x1d, 0xd3, 0xc2,
+ 0x44, 0xb8, 0x42, 0x88, 0x01, 0x49, 0x5a, 0x58,
+ 0xb2, 0x16, 0x54, 0x7e, 0x7e, 0x84, 0x7c, 0x46,
+ 0xd1, 0xd7, 0x56, 0x37, 0x7b, 0x62, 0x42, 0xd2,
+ 0xe5, 0xfb, 0x83, 0xbf, 0x75, 0x2b, 0x54, 0xe0,
+ 0xdf, 0x71, 0xe8, 0x89, 0xf3, 0xa2, 0xbb, 0x0f,
+ 0x4c, 0x10, 0x80, 0x5b, 0xf3, 0xc5, 0x90, 0x37,
+ 0x6e, 0x3c, 0x24, 0xe2, 0x2f, 0xf5, 0x7f, 0x7f,
+ 0xa9, 0x65, 0x57, 0x73, 0x75, 0x32, 0x5c, 0xea,
+ 0x5d, 0x92, 0x0d, 0xb9, 0x4b, 0x9c, 0x33, 0x6b,
+ 0x45, 0x5f, 0x6e, 0x89, 0x4c, 0x01, 0x86, 0x6f,
+ 0xe9, 0xfb, 0xb8, 0xc8, 0xd3, 0xf7, 0x0a, 0x29,
+ 0x57, 0x28, 0x5f, 0x6d, 0xfb, 0x5d, 0xcd, 0x8c,
+ 0xbf, 0x54, 0x78, 0x2f, 0x8f, 0xe7, 0x76, 0x6d,
+ 0x47, 0x23, 0x81, 0x99, 0x13, 0xac, 0x77, 0x34,
+ 0x21, 0xe3, 0xa3, 0x10, 0x95, 0x86, 0x6b, 0xad,
+ 0x22, 0xc8, 0x6a, 0x60, 0x36, 0xb2, 0x51, 0x8b,
+ 0x20, 0x59, 0xb4, 0x22, 0x9d, 0x18, 0xc8, 0xc2,
+ 0xcc, 0xbd, 0xf9, 0x06, 0xc6, 0xcc, 0x6e, 0x82,
+ 0x46, 0x4e, 0xe5, 0x7b, 0xdd, 0xb0, 0xbe, 0xbc,
+ 0xb1, 0xdc, 0x64, 0x53, 0x25, 0xbf, 0xb3, 0xe6,
+ 0x65, 0xef, 0x72, 0x51, 0x08, 0x2c, 0x88, 0xeb,
+ 0xb1, 0xcf, 0x20, 0x3b, 0xd7, 0x79, 0xfd, 0xd3,
+ 0x86, 0x75, 0x71, 0x3c, 0x8d, 0xaa, 0xdd, 0x17,
+ 0xe1, 0xca, 0xbe, 0xe4, 0x32, 0xb0, 0x97, 0x87,
+ 0xb6, 0xdd, 0xf3, 0x30, 0x4e, 0x38, 0xb7, 0x31,
+ 0xb4, 0x5d, 0xf5, 0xdf, 0x51, 0xb7, 0x8f, 0xcf,
+ 0xb3, 0xd3, 0x24, 0x66, 0x02, 0x8d, 0x0b, 0xa3,
+ 0x65, 0x55, 0xe7, 0xe1, 0x1a, 0xb0, 0xee, 0x06,
+ 0x66, 0x06, 0x1d, 0x16, 0x45, 0xd9, 0x62, 0x44,
+ 0x4b, 0xc4, 0x7a, 0x38, 0x18, 0x89, 0x30, 0xa8,
+ 0x4b, 0x4d, 0x56, 0x13, 0x95, 0xc7, 0x3c, 0x08,
+ 0x70, 0x21, 0x92, 0x7c, 0xa6, 0x38, 0xb7, 0xaf,
+ 0xc8, 0xa8, 0x67, 0x9c, 0xcb, 0x84, 0xc2, 0x65,
+ 0x55, 0x44, 0x0e, 0xc7, 0xf1, 0x04, 0x45, 0xcd
+};
+
+/*
+ * Vector 15
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f10
+ * CTX 6c1625db4671522d3d7599601de7ca09ed
+ * Plaintext length (bytes): 17
+ */
+
+static uint8_t v15_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v15_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v15_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v15_PTX[17] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10
+};
+
+static uint8_t v15_CTX[17] = {
+ 0x6c, 0x16, 0x25, 0xdb, 0x46, 0x71, 0x52, 0x2d,
+ 0x3d, 0x75, 0x99, 0x60, 0x1d, 0xe7, 0xca, 0x09,
+ 0xed
+};
+
+/*
+ * Vector 16
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f1011
+ * CTX d069444b7a7e0cab09e24447d24deb1fedbf
+ * Plaintext length (bytes): 18
+ */
+static uint8_t v16_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v16_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v16_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v16_PTX[18] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11
+};
+
+static uint8_t v16_CTX[18] = {
+ 0xd0, 0x69, 0x44, 0x4b, 0x7a, 0x7e, 0x0c, 0xab,
+ 0x09, 0xe2, 0x44, 0x47, 0xd2, 0x4d, 0xeb, 0x1f,
+ 0xed, 0xbf
+};
+
+/*
+ * Vector 17
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f101112
+ * CTX e5df1351c0544ba1350b3363cd8ef4beedbf9d
+ * Plaintext length (bytes): 19
+ */
+
+static uint8_t v17_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v17_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v17_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v17_PTX[19] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12
+};
+
+static uint8_t v17_CTX[19] = {
+ 0xe5, 0xdf, 0x13, 0x51, 0xc0, 0x54, 0x4b, 0xa1,
+ 0x35, 0x0b, 0x33, 0x63, 0xcd, 0x8e, 0xf4, 0xbe,
+ 0xed, 0xbf, 0x9d
+};
+
+/*
+ * Vector 18
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f10111213
+ * CTX 9d84c813f719aa2c7be3f66171c7c5c2edbf9dac
+ * Plaintext length (bytes): 20
+ */
+
+static uint8_t v18_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v18_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v18_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v18_PTX[20] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13
+};
+
+static uint8_t v18_CTX[20] = {
+ 0x9d, 0x84, 0xc8, 0x13, 0xf7, 0x19, 0xaa, 0x2c,
+ 0x7b, 0xe3, 0xf6, 0x61, 0x71, 0xc7, 0xc5, 0xc2,
+ 0xed, 0xbf, 0x9d, 0xac
+};
+
+/*
+ * Vector 19
+ * Key1 e0e1e2e3e4e5e6e7e8e9eaebecedeeef
+ * Key2 c0c1c2c3c4c5c6c7c8c9cacbcccdcecf
+ * Data unit sequence number 21436587a9
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 38b45812ef43a05bd957e545907e223b954ab4aaf088303ad910eadf14b42be6
+ * CTX 8b2461149d8c8ba85f992be970bc621f1b06573f63e867bf5875acafa04e42cc
+ * CTX bd7bd3c2a0fb1fff791ec5ec36c66ae4ac1e806d81fbf709dbe29e471fad3854
+ * CTX 9c8e66f5345d7c1eb94f405d1ec785cc6f6a68f6254dd8339f9d84057e01a177
+ * CTX 41990482999516b5611a38f41bb6478e6f173f320805dd71b1932fc333cb9ee3
+ * CTX 9936beea9ad96fa10fb4112b901734ddad40bc1878995f8e11aee7d141a2f5d4
+ * CTX 8b7a4e1e7f0b2c04830e69a4fd1378411c2f287edf48c6c4e5c247a19680f7fe
+ * CTX 41cefbd49b582106e3616cbbe4dfb2344b2ae9519391f3e0fb4922254b1d6d2d
+ * CTX 19c6d4d537b3a26f3bcc51588b32f3eca0829b6a5ac72578fb814fb43cf80d64
+ * CTX a233e3f997a3f02683342f2b33d25b492536b93becb2f5e1a8b82f5b88334272
+ * CTX 9e8ae09d16938841a21a97fb543eea3bbff59f13c1a18449e398701c1ad51648
+ * CTX 346cbc04c27bb2da3b93a1372ccae548fb53bee476f9e9c91773b1bb19828394
+ * CTX d55d3e1a20ed69113a860b6829ffa847224604435070221b257e8dff783615d2
+ * CTX cae4803a93aa4334ab482a0afac9c0aeda70b45a481df5dec5df8cc0f423c77a
+ * CTX 5fd46cd312021d4b438862419a791be03bb4d97c0e59578542531ba466a83baf
+ * CTX 92cefc151b5cc1611a167893819b63fb8a6b18e86de60290fa72b797b0ce59f3
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v19_key1[16] = {
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef
+};
+
+static uint8_t v19_key2[16] = {
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf
+};
+
+static uint8_t v19_TW[16] = {
+ 0x21, 0x43, 0x65, 0x87, 0xa9, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v19_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v19_CTX[512] = {
+ 0x38, 0xb4, 0x58, 0x12, 0xef, 0x43, 0xa0, 0x5b,
+ 0xd9, 0x57, 0xe5, 0x45, 0x90, 0x7e, 0x22, 0x3b,
+ 0x95, 0x4a, 0xb4, 0xaa, 0xf0, 0x88, 0x30, 0x3a,
+ 0xd9, 0x10, 0xea, 0xdf, 0x14, 0xb4, 0x2b, 0xe6,
+ 0x8b, 0x24, 0x61, 0x14, 0x9d, 0x8c, 0x8b, 0xa8,
+ 0x5f, 0x99, 0x2b, 0xe9, 0x70, 0xbc, 0x62, 0x1f,
+ 0x1b, 0x06, 0x57, 0x3f, 0x63, 0xe8, 0x67, 0xbf,
+ 0x58, 0x75, 0xac, 0xaf, 0xa0, 0x4e, 0x42, 0xcc,
+ 0xbd, 0x7b, 0xd3, 0xc2, 0xa0, 0xfb, 0x1f, 0xff,
+ 0x79, 0x1e, 0xc5, 0xec, 0x36, 0xc6, 0x6a, 0xe4,
+ 0xac, 0x1e, 0x80, 0x6d, 0x81, 0xfb, 0xf7, 0x09,
+ 0xdb, 0xe2, 0x9e, 0x47, 0x1f, 0xad, 0x38, 0x54,
+ 0x9c, 0x8e, 0x66, 0xf5, 0x34, 0x5d, 0x7c, 0x1e,
+ 0xb9, 0x4f, 0x40, 0x5d, 0x1e, 0xc7, 0x85, 0xcc,
+ 0x6f, 0x6a, 0x68, 0xf6, 0x25, 0x4d, 0xd8, 0x33,
+ 0x9f, 0x9d, 0x84, 0x05, 0x7e, 0x01, 0xa1, 0x77,
+ 0x41, 0x99, 0x04, 0x82, 0x99, 0x95, 0x16, 0xb5,
+ 0x61, 0x1a, 0x38, 0xf4, 0x1b, 0xb6, 0x47, 0x8e,
+ 0x6f, 0x17, 0x3f, 0x32, 0x08, 0x05, 0xdd, 0x71,
+ 0xb1, 0x93, 0x2f, 0xc3, 0x33, 0xcb, 0x9e, 0xe3,
+ 0x99, 0x36, 0xbe, 0xea, 0x9a, 0xd9, 0x6f, 0xa1,
+ 0x0f, 0xb4, 0x11, 0x2b, 0x90, 0x17, 0x34, 0xdd,
+ 0xad, 0x40, 0xbc, 0x18, 0x78, 0x99, 0x5f, 0x8e,
+ 0x11, 0xae, 0xe7, 0xd1, 0x41, 0xa2, 0xf5, 0xd4,
+ 0x8b, 0x7a, 0x4e, 0x1e, 0x7f, 0x0b, 0x2c, 0x04,
+ 0x83, 0x0e, 0x69, 0xa4, 0xfd, 0x13, 0x78, 0x41,
+ 0x1c, 0x2f, 0x28, 0x7e, 0xdf, 0x48, 0xc6, 0xc4,
+ 0xe5, 0xc2, 0x47, 0xa1, 0x96, 0x80, 0xf7, 0xfe,
+ 0x41, 0xce, 0xfb, 0xd4, 0x9b, 0x58, 0x21, 0x06,
+ 0xe3, 0x61, 0x6c, 0xbb, 0xe4, 0xdf, 0xb2, 0x34,
+ 0x4b, 0x2a, 0xe9, 0x51, 0x93, 0x91, 0xf3, 0xe0,
+ 0xfb, 0x49, 0x22, 0x25, 0x4b, 0x1d, 0x6d, 0x2d,
+ 0x19, 0xc6, 0xd4, 0xd5, 0x37, 0xb3, 0xa2, 0x6f,
+ 0x3b, 0xcc, 0x51, 0x58, 0x8b, 0x32, 0xf3, 0xec,
+ 0xa0, 0x82, 0x9b, 0x6a, 0x5a, 0xc7, 0x25, 0x78,
+ 0xfb, 0x81, 0x4f, 0xb4, 0x3c, 0xf8, 0x0d, 0x64,
+ 0xa2, 0x33, 0xe3, 0xf9, 0x97, 0xa3, 0xf0, 0x26,
+ 0x83, 0x34, 0x2f, 0x2b, 0x33, 0xd2, 0x5b, 0x49,
+ 0x25, 0x36, 0xb9, 0x3b, 0xec, 0xb2, 0xf5, 0xe1,
+ 0xa8, 0xb8, 0x2f, 0x5b, 0x88, 0x33, 0x42, 0x72,
+ 0x9e, 0x8a, 0xe0, 0x9d, 0x16, 0x93, 0x88, 0x41,
+ 0xa2, 0x1a, 0x97, 0xfb, 0x54, 0x3e, 0xea, 0x3b,
+ 0xbf, 0xf5, 0x9f, 0x13, 0xc1, 0xa1, 0x84, 0x49,
+ 0xe3, 0x98, 0x70, 0x1c, 0x1a, 0xd5, 0x16, 0x48,
+ 0x34, 0x6c, 0xbc, 0x04, 0xc2, 0x7b, 0xb2, 0xda,
+ 0x3b, 0x93, 0xa1, 0x37, 0x2c, 0xca, 0xe5, 0x48,
+ 0xfb, 0x53, 0xbe, 0xe4, 0x76, 0xf9, 0xe9, 0xc9,
+ 0x17, 0x73, 0xb1, 0xbb, 0x19, 0x82, 0x83, 0x94,
+ 0xd5, 0x5d, 0x3e, 0x1a, 0x20, 0xed, 0x69, 0x11,
+ 0x3a, 0x86, 0x0b, 0x68, 0x29, 0xff, 0xa8, 0x47,
+ 0x22, 0x46, 0x04, 0x43, 0x50, 0x70, 0x22, 0x1b,
+ 0x25, 0x7e, 0x8d, 0xff, 0x78, 0x36, 0x15, 0xd2,
+ 0xca, 0xe4, 0x80, 0x3a, 0x93, 0xaa, 0x43, 0x34,
+ 0xab, 0x48, 0x2a, 0x0a, 0xfa, 0xc9, 0xc0, 0xae,
+ 0xda, 0x70, 0xb4, 0x5a, 0x48, 0x1d, 0xf5, 0xde,
+ 0xc5, 0xdf, 0x8c, 0xc0, 0xf4, 0x23, 0xc7, 0x7a,
+ 0x5f, 0xd4, 0x6c, 0xd3, 0x12, 0x02, 0x1d, 0x4b,
+ 0x43, 0x88, 0x62, 0x41, 0x9a, 0x79, 0x1b, 0xe0,
+ 0x3b, 0xb4, 0xd9, 0x7c, 0x0e, 0x59, 0x57, 0x85,
+ 0x42, 0x53, 0x1b, 0xa4, 0x66, 0xa8, 0x3b, 0xaf,
+ 0x92, 0xce, 0xfc, 0x15, 0x1b, 0x5c, 0xc1, 0x61,
+ 0x1a, 0x16, 0x78, 0x93, 0x81, 0x9b, 0x63, 0xfb,
+ 0x8a, 0x6b, 0x18, 0xe8, 0x6d, 0xe6, 0x02, 0x90,
+ 0xfa, 0x72, 0xb7, 0x97, 0xb0, 0xce, 0x59, 0xf3
+};
+
+// Define vector of structs, with pointers to the statically defined vectors
+
+struct xts_vector vlist[NVEC] = {
+
+ // pointers to the statically defined vectors here
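+ // Each entry is { length in bytes, key1, key2, initial tweak, plaintext, expected ciphertext }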
+
+ // Vector 1
+ {sizeof(v1_CTX), v1_key1, v1_key2, v1_TW, v1_PTX, v1_CTX},
+ // Vector 2
+ {sizeof(v2_CTX), v2_key1, v2_key2, v2_TW, v2_PTX, v2_CTX},
+ // Vector 3
+ {sizeof(v3_CTX), v3_key1, v3_key2, v3_TW, v3_PTX, v3_CTX},
+ // Vector 4
+ {sizeof(v4_CTX), v4_key1, v4_key2, v4_TW, v4_PTX, v4_CTX},
+ // Vector 5
+ {sizeof(v5_CTX), v5_key1, v5_key2, v5_TW, v5_PTX, v5_CTX},
+ // Vector 6
+ {sizeof(v6_CTX), v6_key1, v6_key2, v6_TW, v6_PTX, v6_CTX},
+ // Vector 7
+ {sizeof(v7_CTX), v7_key1, v7_key2, v7_TW, v7_PTX, v7_CTX},
+ // Vector 8
+ {sizeof(v8_CTX), v8_key1, v8_key2, v8_TW, v8_PTX, v8_CTX},
+ // Vector 9
+ {sizeof(v9_CTX), v9_key1, v9_key2, v9_TW, v9_PTX, v9_CTX},
+ // Vector 15
+ {sizeof(v15_CTX), v15_key1, v15_key2, v15_TW, v15_PTX, v15_CTX},
+ // Vector 16
+ {sizeof(v16_CTX), v16_key1, v16_key2, v16_TW, v16_PTX, v16_CTX},
+ // Vector 17
+ {sizeof(v17_CTX), v17_key1, v17_key2, v17_TW, v17_PTX, v17_CTX},
+ // Vector 18
+ {sizeof(v18_CTX), v18_key1, v18_key2, v18_TW, v18_PTX, v18_CTX},
+ // Vector 19
+ {sizeof(v19_CTX), v19_key1, v19_key2, v19_TW, v19_PTX, v19_CTX}
+};
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c
new file mode 100644
index 000000000..5bccd4a5c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c
@@ -0,0 +1,145 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
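+// Fill two 32-byte random keys, a 16-byte random tweak and n random plaintext bytes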
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
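+// One-shot AES-256-XTS decrypt of the whole buffer via the OpenSSL EVP interface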
+static inline
+ int openssl_aes_256_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptFinal_ex(ctx, ct + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+ unsigned char *pt, *ct, *dt, *refdt;
+ struct perf start, stop;
+ unsigned char keyssl[64]; /* SSL takes both keys together */
+
+ /* Initialise our cipher context, which can use the same input vectors */
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ printf("aes_xts_256_dec_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+ refdt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt || NULL == refdt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ /* Set up key for the SSL engine */
+ for (i = 0; i < 32; i++) {
+ keyssl[i] = key1[i];
+ keyssl[i + 32] = key2[i];
+ }
+
+ /* Encrypt and compare decrypted output */
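+ /* Note: the XTS calls take the tweak key (key2) first, then the data key (key1) */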
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ openssl_aes_256_xts_dec(ctx, keyssl, tinit, TEST_LEN, ct, refdt);
+ if (memcmp(dt, refdt, TEST_LEN)) {
+ printf("ISA-L and OpenSSL results don't match\n");
+ return -1;
+ }
+
+ /* Time ISA-L decryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ perf_stop(&stop);
+ printf("aes_xts_256_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Time OpenSSL decryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ openssl_aes_256_xts_dec(ctx, keyssl, tinit, TEST_LEN, ct, refdt);
+ perf_stop(&stop);
+ printf("aes_xts_256_openssl_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ EVP_CIPHER_CTX_free(ctx);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c
new file mode 100644
index 000000000..ff3d62e93
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c
@@ -0,0 +1,126 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "aes_keyexp.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 3000000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 400
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+ unsigned char *pt, *ct, *dt;
+ uint8_t expkey1_enc[16 * 15], expkey2_enc[16 * 15];
+ uint8_t expkey1_dec[16 * 15], null_key[16 * 15];
+
+ printf("aes_xts_256_dec_perf:\n");
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ /* Decode perf test */
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+ struct perf start, stop;
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_256_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Expanded keys perf test */
+
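+ // Only key2's encryption schedule is needed (tweak side); its decrypt schedule lands in the dummy null_key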
+ aes_keyexp_256(key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_256(key2, expkey2_enc, null_key);
+ XTS_AES_256_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct, pt);
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_256_dec_expanded_key(expkey2_enc, expkey1_dec, tinit, TEST_LEN, ct,
+ pt);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_256_dec_expanded_key" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c
new file mode 100644
index 000000000..8d477ca89
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c
@@ -0,0 +1,145 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+static inline
+ int openssl_aes_256_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
+
+int main(void)
+{
+ int i;
+ unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+ unsigned char *pt, *ct, *refct;
+ struct perf start, stop;
+ unsigned char keyssl[64]; /* SSL takes both keys together */
+
+ /* Initialise our cipher context, which can use the same input vectors */
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ printf("aes_xts_256_enc_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ refct = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == refct) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (i = 0; i < 32; i++) {
+ keyssl[i] = key1[i];
+ keyssl[i + 32] = key2[i];
+ }
+
+ /* Encrypt and compare output */
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ openssl_aes_256_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct);
+ if (memcmp(ct, refct, TEST_LEN)) {
+ printf("ISA-L and OpenSSL results don't match\n");
+ return -1;
+ }
+
+ /* Time ISA-L encryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ perf_stop(&stop);
+
+ printf("aes_xts_256_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Time OpenSSL encryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ openssl_aes_256_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct);
+ perf_stop(&stop);
+
+ printf("aes_xts_256_ossl_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ EVP_CIPHER_CTX_free(ctx);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c
new file mode 100644
index 000000000..051dd0a0e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c
@@ -0,0 +1,124 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "aes_keyexp.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 3000000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 400
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+ unsigned char *pt, *ct;
+ uint8_t expkey1_enc[16 * 15], expkey2_enc[16 * 15];
+ uint8_t expkey1_dec[16 * 15], null_key[16 * 15];
+
+ printf("aes_xts_256_enc_perf:\n");
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ /* Encode perf test */
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+
+ struct perf start, stop;
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_256_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Expanded keys perf test */
+
+ aes_keyexp_256(key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_256(key2, expkey2_enc, null_key);
+ XTS_AES_256_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN, pt, ct);
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_256_enc_expanded_key(expkey2_enc, expkey1_enc, tinit, TEST_LEN, pt,
+ ct);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_256_enc_expanded_key" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c
new file mode 100644
index 000000000..c8d664a8b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c
@@ -0,0 +1,113 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <aes_keyexp.h>
+#include "xts_256_vect.h"
+
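+// Runs the reference vectors from xts_256_vect.h through the expanded-key encrypt and decrypt paths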
+int main(void)
+{
+
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test;
+ uint8_t *pt_test;
+ // Arrays for expanded keys, null_key is a dummy vector (decrypt key not
+ // needed for the tweak part of the decryption)
+ uint8_t expkey1_enc[16 * 15], expkey2_enc[16 * 15];
+ uint8_t expkey1_dec[16 * 15], null_key[16 * 15];
+
+ int i, j;
+
+ // --- Encryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vlist[i].ptlen);
+ if (ct_test == NULL) {
+ printf("Can't allocate ciphertext memory\n");
+ return -1;
+ }
+ // Pre-expand our keys (will only use the encryption ones here)
+ aes_keyexp_256(vlist[i].key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_256(vlist[i].key2, expkey2_enc, null_key);
+
+ XTS_AES_256_enc_expanded_key(expkey2_enc, expkey1_enc, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (ct_test[j] != vlist[i].CTX[j]) {
+ printf("\nXTS_AES_256_enc: Vector %d: ", i + 10);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ }
+
+ // --- Decryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated plaintext
+ pt_test = malloc(vlist[i].ptlen);
+ if (pt_test == NULL) {
+ printf("Can't allocate plaintext memory\n");
+ return -1;
+ }
+ // Pre-expand keys for the decryption
+ aes_keyexp_256(vlist[i].key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_256(vlist[i].key2, expkey2_enc, null_key);
+
+ // Note, encryption key is re-used for the tweak decryption step
+ XTS_AES_256_dec_expanded_key(expkey2_enc, expkey1_dec, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+ // Carry out comparison of the calculated plaintext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (pt_test[j] != vlist[i].PTX[j]) {
+ printf("\nXTS_AES_256_dec: Vector %d: ", i + 10);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ }
+ printf("Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c
new file mode 100644
index 000000000..5ad7359cc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c
@@ -0,0 +1,249 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_xts.h>
+#include <aes_keyexp.h>
+
+#define TEST_LEN (1024*1024)
+#define TEST_SIZE (4096)
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
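+// Round-trips random keys, tweaks and message sizes through enc/dec (plain and expanded-key), including end-of-buffer cases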
+int main(void)
+{
+ int t, n;
+
+ unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+ unsigned char *pt, *ct, *dt;
+
+ int align, size, min_size;
+ unsigned char *efence_pt;
+ unsigned char *efence_ct;
+ unsigned char *efence_dt;
+
+ unsigned char *origin_pt;
+ unsigned char *origin_ct;
+ unsigned char *origin_dt;
+
+ unsigned char key1_exp_enc[16 * 15], key1_exp_dec[16 * 15];
+ unsigned char key2_exp_tw[16 * 15];
+ int i;
+ printf("aes_xts_256 enc/dec rand test, %d sets of %d max: ", RANDOMS, TEST_LEN);
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+ if (memcmp(pt, dt, TEST_LEN)) {
+ printf("fail\n");
+ return -1;
+ }
+ putchar('.');
+
+ // Do tests with random data, keys and message size
+ for (t = 0; t < RANDOMS; t++) {
+ n = rand() % (TEST_LEN);
+ if (n < 17)
+ continue;
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, n);
+ XTS_AES_256_enc(key2, key1, tinit, n, pt, ct);
+ XTS_AES_256_dec(key2, key1, tinit, n, ct, dt);
+
+ if (memcmp(pt, dt, n)) {
+ printf("fail rand %d, size %d\n", t, n);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ // Run tests at end of buffer for Electric Fence
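+ // (data is placed flush against the end of the allocation so any overrun would fault under efence)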
+ align = 1;
+ min_size = 16;
+ for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ XTS_AES_256_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_256_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+ printf("efence: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ origin_pt = malloc(TEST_LEN);
+ origin_ct = malloc(TEST_LEN);
+ origin_dt = malloc(TEST_LEN);
+ if (NULL == origin_pt || NULL == origin_ct || NULL == origin_dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+ // For data lengths from 0 to 15 bytes, the functions return without any error
+ // codes, without reading or writing any data.
+ for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+ memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+ memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+ XTS_AES_256_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_256_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+ printf("efence_pt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+ printf("efence_ct: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+ printf("efence_dt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ for (i = 0; i < 16 * 15; i++) {
+ key2_exp_tw[i] = rand();
+ }
+
+ for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ aes_keyexp_256(key1, key1_exp_enc, key1_exp_dec);
+
+ XTS_AES_256_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+ TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_256_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+ TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+ printf("efence_expanded_key: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ // For data lengths from 0 to 15 bytes, the functions return without any error
+ // codes, without reading or writing any data.
+ for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+ memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+ memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+ aes_keyexp_256(key1, key1_exp_enc, key1_exp_dec);
+
+ XTS_AES_256_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+ TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_256_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+ TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+ printf("efence_expanded_key for pt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+ printf("efence_expanded_key for ct: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+ printf("efence_expanded_key for dt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+
+ putchar('.');
+ fflush(0);
+ }
+
+ printf("Pass\n");
+
+ return 0;
+}
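The harness above drives the one-shot isa-l XTS API over random buffers and at the end of the allocation. For orientation, here is a minimal round-trip sketch; it is not part of the imported sources and assumes nothing beyond the XTS_AES_256_enc/XTS_AES_256_dec calls and argument order used above (key2 first, then key1, the initial tweak, the byte count, input and output buffers).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <aes_xts.h>

int main(void)
{
	unsigned char key1[32], key2[32], tweak[16];
	unsigned char pt[4096], ct[4096], dt[4096];
	int i;

	/* Random keys, tweak and plaintext, as xts256_mk_rand_data() does */
	for (i = 0; i < 32; i++) {
		key1[i] = rand();
		key2[i] = rand();
	}
	for (i = 0; i < 16; i++)
		tweak[i] = rand();
	for (i = 0; i < (int)sizeof(pt); i++)
		pt[i] = rand();

	/* One-shot encrypt then decrypt; argument order matches the calls above */
	XTS_AES_256_enc(key2, key1, tweak, sizeof(pt), pt, ct);
	XTS_AES_256_dec(key2, key1, tweak, sizeof(pt), ct, dt);

	if (memcmp(pt, dt, sizeof(pt)) != 0) {
		printf("round trip failed\n");
		return -1;
	}
	printf("round trip ok\n");
	return 0;
}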
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c
new file mode 100644
index 000000000..6b25277dc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c
@@ -0,0 +1,273 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/evp.h>
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#ifndef RANDOMS
+# define RANDOMS 128
+#endif
+#define TEST_LOOPS 128
+#define TEST_LEN (1024*1024)
+#define LENGTH_SCAN (2*1024)
+
+/* Generates random data for keys, tweak and plaintext */
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+/* Wrapper for OpenSSL EVP AES-XTS 256 encryption */
+static inline int openssl_aes_256_xts_enc(EVP_CIPHER_CTX *ctx, unsigned char *key,
+ unsigned char *iv, int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv)
+ || (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ || (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))) {
+ printf("\n Error in openssl encoding of %d bytes\n", len);
+ return 1;
+ }
+ return 0;
+}
+
+/* Wrapper for OpenSSL EVP AES-XTS 256 decryption */
+static inline int openssl_aes_256_xts_dec(EVP_CIPHER_CTX *ctx, unsigned char *key,
+ unsigned char *iv, int len, unsigned char *ct, unsigned char *dt)
+{
+ int outlen, tmplen;
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv)
+ || (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, len))
+ || (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen))) {
+ printf("\n Error in openssl decoding of %d bytes\n", len);
+ return 1;
+ }
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+
+ unsigned char key1[32], key2[32], tinit[16];
+ unsigned char *pt, *ct, *dt, *refct, *refdt;
+ unsigned char keyssl[64]; /* SSL takes both keys together */
+ int i, j, k, ret;
+ int seed;
+
+ if (argc == 1)
+ seed = TEST_SEED;
+ else
+ seed = atoi(argv[1]);
+
+ srand(seed);
+ printf("SEED: %d\n", seed);
+
+ /* Initialise the cipher context; it is reused for every test vector below */
+ EVP_CIPHER_CTX *ctx;
+ ctx = EVP_CIPHER_CTX_new();
+
+ /* Allocate space for input and output buffers */
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+ refct = malloc(TEST_LEN);
+ refdt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt || NULL == refct || NULL == refdt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ /**************************** LENGTH SCAN TEST *************************/
+ printf("aes_xts_256_rand_ossl test, %d sets of various length: ", 2 * 1024);
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 32; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 32] = key2[k];
+ }
+
+ for (ret = 0, i = 16; ret == 0 && i < LENGTH_SCAN; i++) {
+
+ /* Encrypt using each method */
+ XTS_AES_256_enc(key2, key1, tinit, i, pt, ct);
+ ret |= openssl_aes_256_xts_enc(ctx, keyssl, tinit, i, pt, refct);
+
+ // Compare
+ for (ret = 0, j = 0; j < i && ret == 0; j++) {
+ if (ct[j] != refct[j])
+ ret = 1;
+ }
+ if (ret)
+ printf(" XTS_AES_256_enc size=%d failed at byte %d!\n", i, j);
+
+ /* Decrypt using each method */
+ XTS_AES_256_dec(key2, key1, tinit, i, ct, dt);
+ ret |= openssl_aes_256_xts_dec(ctx, keyssl, tinit, i, refct, refdt);
+
+ for (j = 0; j < i && ret == 0; j++) {
+ if (dt[j] != refdt[j])
+ ret = 1;
+ }
+ if (ret)
+ printf(" XTS_AES_256_dec size=%d failed at byte %d!\n", i, j);
+ if (0 == i % (LENGTH_SCAN / 16))
+ printf(".");
+ fflush(0);
+ }
+ if (ret)
+ return -1;
+ printf("Pass\n");
+
+ /**************************** FIXED LENGTH TEST *************************/
+ printf("aes_xts_256_rand_ossl test, %d sets of length %d: ", TEST_LOOPS, TEST_LEN);
+
+ /* Loop over the vectors */
+ for (i = 0; i < TEST_LOOPS; i++) {
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 32; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 32] = key2[k];
+ }
+
+ /* Encrypt using each method */
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ if (openssl_aes_256_xts_enc(ctx, keyssl, tinit, TEST_LEN, pt, refct))
+ return -1;
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < TEST_LEN; j++) {
+
+ if (ct[j] != refct[j]) {
+ printf("XTS_AES_256_enc failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+
+ /* Decrypt using each method */
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ if (openssl_aes_256_xts_dec(ctx, keyssl, tinit, TEST_LEN, refct, refdt))
+ return -1;
+
+ for (j = 0; j < TEST_LEN; j++) {
+
+ if (dt[j] != refdt[j]) {
+ printf("XTS_AES_256_dec failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ if (0 == i % (TEST_LOOPS / 16))
+ printf(".");
+ fflush(0);
+ }
+ printf("Pass\n");
+
+ /**************************** RANDOM LENGTH TEST *************************/
+ printf("aes_xts_256_rand_ossl test, %d sets of random lengths: ", RANDOMS);
+
+ /* Run tests with random size */
+
+ unsigned int rand_len, t;
+
+ for (t = 0; t < RANDOMS; t++) {
+
+ rand_len = rand() % (TEST_LEN);
+ rand_len = rand_len < 16 ? 16 : rand_len;
+ xts256_mk_rand_data(key1, key2, tinit, pt, rand_len);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 32; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 32] = key2[k];
+ }
+
+ /* Encrypt using each method */
+ XTS_AES_256_enc(key2, key1, tinit, rand_len, pt, ct);
+ if (openssl_aes_256_xts_enc(ctx, keyssl, tinit, rand_len, pt, refct))
+ return -1;
+
+ /* Carry out comparison of the calculated ciphertext with
+ * the reference
+ */
+ for (j = 0; j < rand_len; j++) {
+
+ if (ct[j] != refct[j]) {
+ printf("XTS_AES_256_enc failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+
+ /* Decrypt using each method */
+ XTS_AES_256_dec(key2, key1, tinit, rand_len, ct, dt);
+ if (openssl_aes_256_xts_dec(ctx, keyssl, tinit, rand_len, refct, refdt))
+ return -1;
+
+ for (j = 0; j < rand_len; j++) {
+
+ if (dt[j] != refdt[j]) {
+ printf("XTS_AES_256_dec failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ if (0 == t % (RANDOMS / 16))
+ printf(".");
+ fflush(0);
+ }
+
+ EVP_CIPHER_CTX_free(ctx);
+
+ printf("Pass\n");
+
+ printf("aes_xts_256_rand_ossl: All tests passed\n");
+
+ return 0;
+}
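One detail worth noting from the harness above: the isa-l entry points take the two 256-bit keys as separate arguments (key2 first, key1 second, as in the calls above), while OpenSSL's EVP_aes_256_xts() expects a single 64-byte key, which the keyssl[] loop builds by packing key1 followed by key2. The helper below is only a sketch of that mapping, not upstream code; EVP error handling and context creation are omitted for brevity.

#include <string.h>
#include <openssl/evp.h>
#include "aes_xts.h"

/* Encrypt len bytes with both implementations so the outputs can be compared. */
static void xts256_enc_both(EVP_CIPHER_CTX *ctx,
			    unsigned char key1[32], unsigned char key2[32],
			    unsigned char tweak[16], unsigned char *pt, int len,
			    unsigned char *ct_isal, unsigned char *ct_ossl)
{
	unsigned char keyssl[64];
	int outlen, tmplen;

	/* Same packing as the keyssl[] loop in main() above */
	memcpy(keyssl, key1, 32);
	memcpy(keyssl + 32, key2, 32);

	XTS_AES_256_enc(key2, key1, tweak, len, pt, ct_isal);

	EVP_EncryptInit_ex(ctx, EVP_aes_256_xts(), NULL, keyssl, tweak);
	EVP_EncryptUpdate(ctx, ct_ossl, &outlen, pt, len);
	EVP_EncryptFinal_ex(ctx, ct_ossl + outlen, &tmplen);
	/* For len >= 16, ct_isal and ct_ossl should be byte-for-byte equal */
}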
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c
new file mode 100644
index 000000000..2c961f44f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c
@@ -0,0 +1,105 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "xts_256_vect.h"
+
+int main(void)
+{
+
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test;
+ uint8_t *pt_test;
+
+ int i, j;
+
+ // --- Encryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vlist[i].ptlen);
+ if (ct_test == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return -1;
+ }
+
+ XTS_AES_256_enc(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (ct_test[j] != vlist[i].CTX[j]) {
+ printf("\nXTS_AES_256_enc: Vector %d: ", i + 10);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+
+ free(ct_test);
+ ct_test = NULL;
+ }
+
+ // --- Decryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated plaintext
+ pt_test = malloc(vlist[i].ptlen);
+ if (pt_test == NULL) {
+ fprintf(stderr, "Can't allocate plaintext memory\n");
+ return -1;
+ }
+
+ XTS_AES_256_dec(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+ // Carry out comparison of the calculated plaintext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (pt_test[j] != vlist[i].PTX[j]) {
+ printf("\nXTS_AES_256_dec: Vector %d: ", i + 10);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+
+ free(pt_test);
+ pt_test = NULL;
+ }
+ printf("Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h
new file mode 100644
index 000000000..5a893f173
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h
@@ -0,0 +1,1035 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+
+#define NVEC 5
+
+// struct to hold pointers to the key, plaintext and ciphertext vectors
+struct xts_vector {
+ uint64_t ptlen; // length of our plaintext
+ uint8_t *key1; // dimension 32 for 256 bit aes
+ uint8_t *key2; // dimension 32 for 256 bit aes
+ uint8_t *TW; // dimension 16 for both 128 and 256 bit
+ uint8_t *PTX; // min. dimension 16
+ uint8_t *CTX; // same dimension as PTX
+};
+
+/* Define our test vectors statically here. Test vectors are from the standard:
+ * "IEEE Standard for Cryptographic Protection of Data on Block-Oriented
+ * Storage Devices"
+ * http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4493450
+ *
+ * Vector 10
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 1c3b3a102f770386e4836c99e370cf9bea00803f5e482357a4ae12d414a3e63b
+ * CTX 5d31e276f8fe4a8d66b317f9ac683f44680a86ac35adfc3345befecb4bb188fd
+ * CTX 5776926c49a3095eb108fd1098baec70aaa66999a72a82f27d848b21d4a741b0
+ * CTX c5cd4d5fff9dac89aeba122961d03a757123e9870f8acf1000020887891429ca
+ * CTX 2a3e7a7d7df7b10355165c8b9a6d0a7de8b062c4500dc4cd120c0f7418dae3d0
+ * CTX b5781c34803fa75421c790dfe1de1834f280d7667b327f6c8cd7557e12ac3a0f
+ * CTX 93ec05c52e0493ef31a12d3d9260f79a289d6a379bc70c50841473d1a8cc81ec
+ * CTX 583e9645e07b8d9670655ba5bbcfecc6dc3966380ad8fecb17b6ba02469a020a
+ * CTX 84e18e8f84252070c13e9f1f289be54fbc481457778f616015e1327a02b140f1
+ * CTX 505eb309326d68378f8374595c849d84f4c333ec4423885143cb47bd71c5edae
+ * CTX 9be69a2ffeceb1bec9de244fbe15992b11b77c040f12bd8f6a975a44a0f90c29
+ * CTX a9abc3d4d893927284c58754cce294529f8614dcd2aba991925fedc4ae74ffac
+ * CTX 6e333b93eb4aff0479da9a410e4450e0dd7ae4c6e2910900575da401fc07059f
+ * CTX 645e8b7e9bfdef33943054ff84011493c27b3429eaedb4ed5376441a77ed4385
+ * CTX 1ad77f16f541dfd269d50d6a5f14fb0aab1cbb4c1550be97f7ab4066193c4caa
+ * CTX 773dad38014bd2092fa755c824bb5e54c4f36ffda9fcea70b9c6e693e148c151
+ * Plaintext length (bytes): 512
+ */
+
+static uint8_t v10_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v10_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v10_TW[16] = {
+ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v10_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v10_CTX[512] = {
+ 0x1c, 0x3b, 0x3a, 0x10, 0x2f, 0x77, 0x03, 0x86,
+ 0xe4, 0x83, 0x6c, 0x99, 0xe3, 0x70, 0xcf, 0x9b,
+ 0xea, 0x00, 0x80, 0x3f, 0x5e, 0x48, 0x23, 0x57,
+ 0xa4, 0xae, 0x12, 0xd4, 0x14, 0xa3, 0xe6, 0x3b,
+ 0x5d, 0x31, 0xe2, 0x76, 0xf8, 0xfe, 0x4a, 0x8d,
+ 0x66, 0xb3, 0x17, 0xf9, 0xac, 0x68, 0x3f, 0x44,
+ 0x68, 0x0a, 0x86, 0xac, 0x35, 0xad, 0xfc, 0x33,
+ 0x45, 0xbe, 0xfe, 0xcb, 0x4b, 0xb1, 0x88, 0xfd,
+ 0x57, 0x76, 0x92, 0x6c, 0x49, 0xa3, 0x09, 0x5e,
+ 0xb1, 0x08, 0xfd, 0x10, 0x98, 0xba, 0xec, 0x70,
+ 0xaa, 0xa6, 0x69, 0x99, 0xa7, 0x2a, 0x82, 0xf2,
+ 0x7d, 0x84, 0x8b, 0x21, 0xd4, 0xa7, 0x41, 0xb0,
+ 0xc5, 0xcd, 0x4d, 0x5f, 0xff, 0x9d, 0xac, 0x89,
+ 0xae, 0xba, 0x12, 0x29, 0x61, 0xd0, 0x3a, 0x75,
+ 0x71, 0x23, 0xe9, 0x87, 0x0f, 0x8a, 0xcf, 0x10,
+ 0x00, 0x02, 0x08, 0x87, 0x89, 0x14, 0x29, 0xca,
+ 0x2a, 0x3e, 0x7a, 0x7d, 0x7d, 0xf7, 0xb1, 0x03,
+ 0x55, 0x16, 0x5c, 0x8b, 0x9a, 0x6d, 0x0a, 0x7d,
+ 0xe8, 0xb0, 0x62, 0xc4, 0x50, 0x0d, 0xc4, 0xcd,
+ 0x12, 0x0c, 0x0f, 0x74, 0x18, 0xda, 0xe3, 0xd0,
+ 0xb5, 0x78, 0x1c, 0x34, 0x80, 0x3f, 0xa7, 0x54,
+ 0x21, 0xc7, 0x90, 0xdf, 0xe1, 0xde, 0x18, 0x34,
+ 0xf2, 0x80, 0xd7, 0x66, 0x7b, 0x32, 0x7f, 0x6c,
+ 0x8c, 0xd7, 0x55, 0x7e, 0x12, 0xac, 0x3a, 0x0f,
+ 0x93, 0xec, 0x05, 0xc5, 0x2e, 0x04, 0x93, 0xef,
+ 0x31, 0xa1, 0x2d, 0x3d, 0x92, 0x60, 0xf7, 0x9a,
+ 0x28, 0x9d, 0x6a, 0x37, 0x9b, 0xc7, 0x0c, 0x50,
+ 0x84, 0x14, 0x73, 0xd1, 0xa8, 0xcc, 0x81, 0xec,
+ 0x58, 0x3e, 0x96, 0x45, 0xe0, 0x7b, 0x8d, 0x96,
+ 0x70, 0x65, 0x5b, 0xa5, 0xbb, 0xcf, 0xec, 0xc6,
+ 0xdc, 0x39, 0x66, 0x38, 0x0a, 0xd8, 0xfe, 0xcb,
+ 0x17, 0xb6, 0xba, 0x02, 0x46, 0x9a, 0x02, 0x0a,
+ 0x84, 0xe1, 0x8e, 0x8f, 0x84, 0x25, 0x20, 0x70,
+ 0xc1, 0x3e, 0x9f, 0x1f, 0x28, 0x9b, 0xe5, 0x4f,
+ 0xbc, 0x48, 0x14, 0x57, 0x77, 0x8f, 0x61, 0x60,
+ 0x15, 0xe1, 0x32, 0x7a, 0x02, 0xb1, 0x40, 0xf1,
+ 0x50, 0x5e, 0xb3, 0x09, 0x32, 0x6d, 0x68, 0x37,
+ 0x8f, 0x83, 0x74, 0x59, 0x5c, 0x84, 0x9d, 0x84,
+ 0xf4, 0xc3, 0x33, 0xec, 0x44, 0x23, 0x88, 0x51,
+ 0x43, 0xcb, 0x47, 0xbd, 0x71, 0xc5, 0xed, 0xae,
+ 0x9b, 0xe6, 0x9a, 0x2f, 0xfe, 0xce, 0xb1, 0xbe,
+ 0xc9, 0xde, 0x24, 0x4f, 0xbe, 0x15, 0x99, 0x2b,
+ 0x11, 0xb7, 0x7c, 0x04, 0x0f, 0x12, 0xbd, 0x8f,
+ 0x6a, 0x97, 0x5a, 0x44, 0xa0, 0xf9, 0x0c, 0x29,
+ 0xa9, 0xab, 0xc3, 0xd4, 0xd8, 0x93, 0x92, 0x72,
+ 0x84, 0xc5, 0x87, 0x54, 0xcc, 0xe2, 0x94, 0x52,
+ 0x9f, 0x86, 0x14, 0xdc, 0xd2, 0xab, 0xa9, 0x91,
+ 0x92, 0x5f, 0xed, 0xc4, 0xae, 0x74, 0xff, 0xac,
+ 0x6e, 0x33, 0x3b, 0x93, 0xeb, 0x4a, 0xff, 0x04,
+ 0x79, 0xda, 0x9a, 0x41, 0x0e, 0x44, 0x50, 0xe0,
+ 0xdd, 0x7a, 0xe4, 0xc6, 0xe2, 0x91, 0x09, 0x00,
+ 0x57, 0x5d, 0xa4, 0x01, 0xfc, 0x07, 0x05, 0x9f,
+ 0x64, 0x5e, 0x8b, 0x7e, 0x9b, 0xfd, 0xef, 0x33,
+ 0x94, 0x30, 0x54, 0xff, 0x84, 0x01, 0x14, 0x93,
+ 0xc2, 0x7b, 0x34, 0x29, 0xea, 0xed, 0xb4, 0xed,
+ 0x53, 0x76, 0x44, 0x1a, 0x77, 0xed, 0x43, 0x85,
+ 0x1a, 0xd7, 0x7f, 0x16, 0xf5, 0x41, 0xdf, 0xd2,
+ 0x69, 0xd5, 0x0d, 0x6a, 0x5f, 0x14, 0xfb, 0x0a,
+ 0xab, 0x1c, 0xbb, 0x4c, 0x15, 0x50, 0xbe, 0x97,
+ 0xf7, 0xab, 0x40, 0x66, 0x19, 0x3c, 0x4c, 0xaa,
+ 0x77, 0x3d, 0xad, 0x38, 0x01, 0x4b, 0xd2, 0x09,
+ 0x2f, 0xa7, 0x55, 0xc8, 0x24, 0xbb, 0x5e, 0x54,
+ 0xc4, 0xf3, 0x6f, 0xfd, 0xa9, 0xfc, 0xea, 0x70,
+ 0xb9, 0xc6, 0xe6, 0x93, 0xe1, 0x48, 0xc1, 0x51
+};
+
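+/* Sketch (an assumption, not part of the standard vectors): the vlist[] table
+ * that xts_256_test.c iterates over, presumably defined at the end of this
+ * header, is expected to wire each vector's arrays into struct xts_vector.
+ * For Vector 10 such an entry would read
+ *
+ *   { 512, v10_key1, v10_key2, v10_TW, v10_PTX, v10_CTX }
+ *
+ * i.e. ptlen, key1, key2, TW, PTX, CTX in the field order declared above.
+ */
+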
+/*
+ * Vector 11
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 77a31251618a15e6b92d1d66dffe7b50b50bad552305ba0217a610688eff7e11
+ * CTX e1d0225438e093242d6db274fde801d4cae06f2092c728b2478559df58e837c2
+ * CTX 469ee4a4fa794e4bbc7f39bc026e3cb72c33b0888f25b4acf56a2a9804f1ce6d
+ * CTX 3d6e1dc6ca181d4b546179d55544aa7760c40d06741539c7e3cd9d2f6650b201
+ * CTX 3fd0eeb8c2b8e3d8d240ccae2d4c98320a7442e1c8d75a42d6e6cfa4c2eca179
+ * CTX 8d158c7aecdf82490f24bb9b38e108bcda12c3faf9a21141c3613b58367f922a
+ * CTX aa26cd22f23d708dae699ad7cb40a8ad0b6e2784973dcb605684c08b8d6998c6
+ * CTX 9aac049921871ebb65301a4619ca80ecb485a31d744223ce8ddc2394828d6a80
+ * CTX 470c092f5ba413c3378fa6054255c6f9df4495862bbb3287681f931b687c888a
+ * CTX bf844dfc8fc28331e579928cd12bd2390ae123cf03818d14dedde5c0c24c8ab0
+ * CTX 18bfca75ca096f2d531f3d1619e785f1ada437cab92e980558b3dce1474afb75
+ * CTX bfedbf8ff54cb2618e0244c9ac0d3c66fb51598cd2db11f9be39791abe447c63
+ * CTX 094f7c453b7ff87cb5bb36b7c79efb0872d17058b83b15ab0866ad8a58656c5a
+ * CTX 7e20dbdf308b2461d97c0ec0024a2715055249cf3b478ddd4740de654f75ca68
+ * CTX 6e0d7345c69ed50cdc2a8b332b1f8824108ac937eb050585608ee734097fc090
+ * CTX 54fbff89eeaeea791f4a7ab1f9868294a4f9e27b42af8100cb9d59cef9645803
+ * Plaintext length (bytes): 512
+ *
+*/
+static uint8_t v11_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v11_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v11_TW[16] = {
+ 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v11_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v11_CTX[512] = {
+ 0x77, 0xa3, 0x12, 0x51, 0x61, 0x8a, 0x15, 0xe6,
+ 0xb9, 0x2d, 0x1d, 0x66, 0xdf, 0xfe, 0x7b, 0x50,
+ 0xb5, 0x0b, 0xad, 0x55, 0x23, 0x05, 0xba, 0x02,
+ 0x17, 0xa6, 0x10, 0x68, 0x8e, 0xff, 0x7e, 0x11,
+ 0xe1, 0xd0, 0x22, 0x54, 0x38, 0xe0, 0x93, 0x24,
+ 0x2d, 0x6d, 0xb2, 0x74, 0xfd, 0xe8, 0x01, 0xd4,
+ 0xca, 0xe0, 0x6f, 0x20, 0x92, 0xc7, 0x28, 0xb2,
+ 0x47, 0x85, 0x59, 0xdf, 0x58, 0xe8, 0x37, 0xc2,
+ 0x46, 0x9e, 0xe4, 0xa4, 0xfa, 0x79, 0x4e, 0x4b,
+ 0xbc, 0x7f, 0x39, 0xbc, 0x02, 0x6e, 0x3c, 0xb7,
+ 0x2c, 0x33, 0xb0, 0x88, 0x8f, 0x25, 0xb4, 0xac,
+ 0xf5, 0x6a, 0x2a, 0x98, 0x04, 0xf1, 0xce, 0x6d,
+ 0x3d, 0x6e, 0x1d, 0xc6, 0xca, 0x18, 0x1d, 0x4b,
+ 0x54, 0x61, 0x79, 0xd5, 0x55, 0x44, 0xaa, 0x77,
+ 0x60, 0xc4, 0x0d, 0x06, 0x74, 0x15, 0x39, 0xc7,
+ 0xe3, 0xcd, 0x9d, 0x2f, 0x66, 0x50, 0xb2, 0x01,
+ 0x3f, 0xd0, 0xee, 0xb8, 0xc2, 0xb8, 0xe3, 0xd8,
+ 0xd2, 0x40, 0xcc, 0xae, 0x2d, 0x4c, 0x98, 0x32,
+ 0x0a, 0x74, 0x42, 0xe1, 0xc8, 0xd7, 0x5a, 0x42,
+ 0xd6, 0xe6, 0xcf, 0xa4, 0xc2, 0xec, 0xa1, 0x79,
+ 0x8d, 0x15, 0x8c, 0x7a, 0xec, 0xdf, 0x82, 0x49,
+ 0x0f, 0x24, 0xbb, 0x9b, 0x38, 0xe1, 0x08, 0xbc,
+ 0xda, 0x12, 0xc3, 0xfa, 0xf9, 0xa2, 0x11, 0x41,
+ 0xc3, 0x61, 0x3b, 0x58, 0x36, 0x7f, 0x92, 0x2a,
+ 0xaa, 0x26, 0xcd, 0x22, 0xf2, 0x3d, 0x70, 0x8d,
+ 0xae, 0x69, 0x9a, 0xd7, 0xcb, 0x40, 0xa8, 0xad,
+ 0x0b, 0x6e, 0x27, 0x84, 0x97, 0x3d, 0xcb, 0x60,
+ 0x56, 0x84, 0xc0, 0x8b, 0x8d, 0x69, 0x98, 0xc6,
+ 0x9a, 0xac, 0x04, 0x99, 0x21, 0x87, 0x1e, 0xbb,
+ 0x65, 0x30, 0x1a, 0x46, 0x19, 0xca, 0x80, 0xec,
+ 0xb4, 0x85, 0xa3, 0x1d, 0x74, 0x42, 0x23, 0xce,
+ 0x8d, 0xdc, 0x23, 0x94, 0x82, 0x8d, 0x6a, 0x80,
+ 0x47, 0x0c, 0x09, 0x2f, 0x5b, 0xa4, 0x13, 0xc3,
+ 0x37, 0x8f, 0xa6, 0x05, 0x42, 0x55, 0xc6, 0xf9,
+ 0xdf, 0x44, 0x95, 0x86, 0x2b, 0xbb, 0x32, 0x87,
+ 0x68, 0x1f, 0x93, 0x1b, 0x68, 0x7c, 0x88, 0x8a,
+ 0xbf, 0x84, 0x4d, 0xfc, 0x8f, 0xc2, 0x83, 0x31,
+ 0xe5, 0x79, 0x92, 0x8c, 0xd1, 0x2b, 0xd2, 0x39,
+ 0x0a, 0xe1, 0x23, 0xcf, 0x03, 0x81, 0x8d, 0x14,
+ 0xde, 0xdd, 0xe5, 0xc0, 0xc2, 0x4c, 0x8a, 0xb0,
+ 0x18, 0xbf, 0xca, 0x75, 0xca, 0x09, 0x6f, 0x2d,
+ 0x53, 0x1f, 0x3d, 0x16, 0x19, 0xe7, 0x85, 0xf1,
+ 0xad, 0xa4, 0x37, 0xca, 0xb9, 0x2e, 0x98, 0x05,
+ 0x58, 0xb3, 0xdc, 0xe1, 0x47, 0x4a, 0xfb, 0x75,
+ 0xbf, 0xed, 0xbf, 0x8f, 0xf5, 0x4c, 0xb2, 0x61,
+ 0x8e, 0x02, 0x44, 0xc9, 0xac, 0x0d, 0x3c, 0x66,
+ 0xfb, 0x51, 0x59, 0x8c, 0xd2, 0xdb, 0x11, 0xf9,
+ 0xbe, 0x39, 0x79, 0x1a, 0xbe, 0x44, 0x7c, 0x63,
+ 0x09, 0x4f, 0x7c, 0x45, 0x3b, 0x7f, 0xf8, 0x7c,
+ 0xb5, 0xbb, 0x36, 0xb7, 0xc7, 0x9e, 0xfb, 0x08,
+ 0x72, 0xd1, 0x70, 0x58, 0xb8, 0x3b, 0x15, 0xab,
+ 0x08, 0x66, 0xad, 0x8a, 0x58, 0x65, 0x6c, 0x5a,
+ 0x7e, 0x20, 0xdb, 0xdf, 0x30, 0x8b, 0x24, 0x61,
+ 0xd9, 0x7c, 0x0e, 0xc0, 0x02, 0x4a, 0x27, 0x15,
+ 0x05, 0x52, 0x49, 0xcf, 0x3b, 0x47, 0x8d, 0xdd,
+ 0x47, 0x40, 0xde, 0x65, 0x4f, 0x75, 0xca, 0x68,
+ 0x6e, 0x0d, 0x73, 0x45, 0xc6, 0x9e, 0xd5, 0x0c,
+ 0xdc, 0x2a, 0x8b, 0x33, 0x2b, 0x1f, 0x88, 0x24,
+ 0x10, 0x8a, 0xc9, 0x37, 0xeb, 0x05, 0x05, 0x85,
+ 0x60, 0x8e, 0xe7, 0x34, 0x09, 0x7f, 0xc0, 0x90,
+ 0x54, 0xfb, 0xff, 0x89, 0xee, 0xae, 0xea, 0x79,
+ 0x1f, 0x4a, 0x7a, 0xb1, 0xf9, 0x86, 0x82, 0x94,
+ 0xa4, 0xf9, 0xe2, 0x7b, 0x42, 0xaf, 0x81, 0x00,
+ 0xcb, 0x9d, 0x59, 0xce, 0xf9, 0x64, 0x58, 0x03
+};
+
+/*
+ * Vector 12
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX e387aaa58ba483afa7e8eb469778317ecf4cf573aa9d4eac23f2cdf914e4e200
+ * CTX a8b490e42ee646802dc6ee2b471b278195d60918ececb44bf79966f83faba049
+ * CTX 9298ebc699c0c8634715a320bb4f075d622e74c8c932004f25b41e361025b5a8
+ * CTX 7815391f6108fc4afa6a05d9303c6ba68a128a55705d415985832fdeaae6c8e1
+ * CTX 9110e84d1b1f199a2692119edc96132658f09da7c623efcec712537a3d94c0bf
+ * CTX 5d7e352ec94ae5797fdb377dc1551150721adf15bd26a8efc2fcaad56881fa9e
+ * CTX 62462c28f30ae1ceaca93c345cf243b73f542e2074a705bd2643bb9f7cc79bb6
+ * CTX e7091ea6e232df0f9ad0d6cf502327876d82207abf2115cdacf6d5a48f6c1879
+ * CTX a65b115f0f8b3cb3c59d15dd8c769bc014795a1837f3901b5845eb491adfefe0
+ * CTX 97b1fa30a12fc1f65ba22905031539971a10f2f36c321bb51331cdefb39e3964
+ * CTX c7ef079994f5b69b2edd83a71ef549971ee93f44eac3938fcdd61d01fa71799d
+ * CTX a3a8091c4c48aa9ed263ff0749df95d44fef6a0bb578ec69456aa5408ae32c7a
+ * CTX f08ad7ba8921287e3bbee31b767be06a0e705c864a769137df28292283ea81a2
+ * CTX 480241b44d9921cdbec1bc28dc1fda114bd8e5217ac9d8ebafa720e9da4f9ace
+ * CTX 231cc949e5b96fe76ffc21063fddc83a6b8679c00d35e09576a875305bed5f36
+ * CTX ed242c8900dd1fa965bc950dfce09b132263a1eef52dd6888c309f5a7d712826
+ * Plaintext length (bytes): 512
+*/
+
+static uint8_t v12_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v12_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v12_TW[16] = {
+ 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v12_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v12_CTX[512] = {
+ 0xe3, 0x87, 0xaa, 0xa5, 0x8b, 0xa4, 0x83, 0xaf,
+ 0xa7, 0xe8, 0xeb, 0x46, 0x97, 0x78, 0x31, 0x7e,
+ 0xcf, 0x4c, 0xf5, 0x73, 0xaa, 0x9d, 0x4e, 0xac,
+ 0x23, 0xf2, 0xcd, 0xf9, 0x14, 0xe4, 0xe2, 0x00,
+ 0xa8, 0xb4, 0x90, 0xe4, 0x2e, 0xe6, 0x46, 0x80,
+ 0x2d, 0xc6, 0xee, 0x2b, 0x47, 0x1b, 0x27, 0x81,
+ 0x95, 0xd6, 0x09, 0x18, 0xec, 0xec, 0xb4, 0x4b,
+ 0xf7, 0x99, 0x66, 0xf8, 0x3f, 0xab, 0xa0, 0x49,
+ 0x92, 0x98, 0xeb, 0xc6, 0x99, 0xc0, 0xc8, 0x63,
+ 0x47, 0x15, 0xa3, 0x20, 0xbb, 0x4f, 0x07, 0x5d,
+ 0x62, 0x2e, 0x74, 0xc8, 0xc9, 0x32, 0x00, 0x4f,
+ 0x25, 0xb4, 0x1e, 0x36, 0x10, 0x25, 0xb5, 0xa8,
+ 0x78, 0x15, 0x39, 0x1f, 0x61, 0x08, 0xfc, 0x4a,
+ 0xfa, 0x6a, 0x05, 0xd9, 0x30, 0x3c, 0x6b, 0xa6,
+ 0x8a, 0x12, 0x8a, 0x55, 0x70, 0x5d, 0x41, 0x59,
+ 0x85, 0x83, 0x2f, 0xde, 0xaa, 0xe6, 0xc8, 0xe1,
+ 0x91, 0x10, 0xe8, 0x4d, 0x1b, 0x1f, 0x19, 0x9a,
+ 0x26, 0x92, 0x11, 0x9e, 0xdc, 0x96, 0x13, 0x26,
+ 0x58, 0xf0, 0x9d, 0xa7, 0xc6, 0x23, 0xef, 0xce,
+ 0xc7, 0x12, 0x53, 0x7a, 0x3d, 0x94, 0xc0, 0xbf,
+ 0x5d, 0x7e, 0x35, 0x2e, 0xc9, 0x4a, 0xe5, 0x79,
+ 0x7f, 0xdb, 0x37, 0x7d, 0xc1, 0x55, 0x11, 0x50,
+ 0x72, 0x1a, 0xdf, 0x15, 0xbd, 0x26, 0xa8, 0xef,
+ 0xc2, 0xfc, 0xaa, 0xd5, 0x68, 0x81, 0xfa, 0x9e,
+ 0x62, 0x46, 0x2c, 0x28, 0xf3, 0x0a, 0xe1, 0xce,
+ 0xac, 0xa9, 0x3c, 0x34, 0x5c, 0xf2, 0x43, 0xb7,
+ 0x3f, 0x54, 0x2e, 0x20, 0x74, 0xa7, 0x05, 0xbd,
+ 0x26, 0x43, 0xbb, 0x9f, 0x7c, 0xc7, 0x9b, 0xb6,
+ 0xe7, 0x09, 0x1e, 0xa6, 0xe2, 0x32, 0xdf, 0x0f,
+ 0x9a, 0xd0, 0xd6, 0xcf, 0x50, 0x23, 0x27, 0x87,
+ 0x6d, 0x82, 0x20, 0x7a, 0xbf, 0x21, 0x15, 0xcd,
+ 0xac, 0xf6, 0xd5, 0xa4, 0x8f, 0x6c, 0x18, 0x79,
+ 0xa6, 0x5b, 0x11, 0x5f, 0x0f, 0x8b, 0x3c, 0xb3,
+ 0xc5, 0x9d, 0x15, 0xdd, 0x8c, 0x76, 0x9b, 0xc0,
+ 0x14, 0x79, 0x5a, 0x18, 0x37, 0xf3, 0x90, 0x1b,
+ 0x58, 0x45, 0xeb, 0x49, 0x1a, 0xdf, 0xef, 0xe0,
+ 0x97, 0xb1, 0xfa, 0x30, 0xa1, 0x2f, 0xc1, 0xf6,
+ 0x5b, 0xa2, 0x29, 0x05, 0x03, 0x15, 0x39, 0x97,
+ 0x1a, 0x10, 0xf2, 0xf3, 0x6c, 0x32, 0x1b, 0xb5,
+ 0x13, 0x31, 0xcd, 0xef, 0xb3, 0x9e, 0x39, 0x64,
+ 0xc7, 0xef, 0x07, 0x99, 0x94, 0xf5, 0xb6, 0x9b,
+ 0x2e, 0xdd, 0x83, 0xa7, 0x1e, 0xf5, 0x49, 0x97,
+ 0x1e, 0xe9, 0x3f, 0x44, 0xea, 0xc3, 0x93, 0x8f,
+ 0xcd, 0xd6, 0x1d, 0x01, 0xfa, 0x71, 0x79, 0x9d,
+ 0xa3, 0xa8, 0x09, 0x1c, 0x4c, 0x48, 0xaa, 0x9e,
+ 0xd2, 0x63, 0xff, 0x07, 0x49, 0xdf, 0x95, 0xd4,
+ 0x4f, 0xef, 0x6a, 0x0b, 0xb5, 0x78, 0xec, 0x69,
+ 0x45, 0x6a, 0xa5, 0x40, 0x8a, 0xe3, 0x2c, 0x7a,
+ 0xf0, 0x8a, 0xd7, 0xba, 0x89, 0x21, 0x28, 0x7e,
+ 0x3b, 0xbe, 0xe3, 0x1b, 0x76, 0x7b, 0xe0, 0x6a,
+ 0x0e, 0x70, 0x5c, 0x86, 0x4a, 0x76, 0x91, 0x37,
+ 0xdf, 0x28, 0x29, 0x22, 0x83, 0xea, 0x81, 0xa2,
+ 0x48, 0x02, 0x41, 0xb4, 0x4d, 0x99, 0x21, 0xcd,
+ 0xbe, 0xc1, 0xbc, 0x28, 0xdc, 0x1f, 0xda, 0x11,
+ 0x4b, 0xd8, 0xe5, 0x21, 0x7a, 0xc9, 0xd8, 0xeb,
+ 0xaf, 0xa7, 0x20, 0xe9, 0xda, 0x4f, 0x9a, 0xce,
+ 0x23, 0x1c, 0xc9, 0x49, 0xe5, 0xb9, 0x6f, 0xe7,
+ 0x6f, 0xfc, 0x21, 0x06, 0x3f, 0xdd, 0xc8, 0x3a,
+ 0x6b, 0x86, 0x79, 0xc0, 0x0d, 0x35, 0xe0, 0x95,
+ 0x76, 0xa8, 0x75, 0x30, 0x5b, 0xed, 0x5f, 0x36,
+ 0xed, 0x24, 0x2c, 0x89, 0x00, 0xdd, 0x1f, 0xa9,
+ 0x65, 0xbc, 0x95, 0x0d, 0xfc, 0xe0, 0x9b, 0x13,
+ 0x22, 0x63, 0xa1, 0xee, 0xf5, 0x2d, 0xd6, 0x88,
+ 0x8c, 0x30, 0x9f, 0x5a, 0x7d, 0x71, 0x28, 0x26
+};
+
+/*
+ * Vector 13
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffffffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX bf53d2dade78e822a4d949a9bc6766b01b06a8ef70d26748c6a7fc36d80ae4c5
+ * CTX 520f7c4ab0ac8544424fa405162fef5a6b7f229498063618d39f0003cb5fb8d1
+ * CTX c86b643497da1ff945c8d3bedeca4f479702a7a735f043ddb1d6aaade3c4a0ac
+ * CTX 7ca7f3fa5279bef56f82cd7a2f38672e824814e10700300a055e1630b8f1cb0e
+ * CTX 919f5e942010a416e2bf48cb46993d3cb6a51c19bacf864785a00bc2ecff15d3
+ * CTX 50875b246ed53e68be6f55bd7e05cfc2b2ed6432198a6444b6d8c247fab941f5
+ * CTX 69768b5c429366f1d3f00f0345b96123d56204c01c63b22ce78baf116e525ed9
+ * CTX 0fdea39fa469494d3866c31e05f295ff21fea8d4e6e13d67e47ce722e9698a1c
+ * CTX 1048d68ebcde76b86fcf976eab8aa9790268b7068e017a8b9b749409514f1053
+ * CTX 027fd16c3786ea1bac5f15cb79711ee2abe82f5cf8b13ae73030ef5b9e4457e7
+ * CTX 5d1304f988d62dd6fc4b94ed38ba831da4b7634971b6cd8ec325d9c61c00f1df
+ * CTX 73627ed3745a5e8489f3a95c69639c32cd6e1d537a85f75cc844726e8a72fc00
+ * CTX 77ad22000f1d5078f6b866318c668f1ad03d5a5fced5219f2eabbd0aa5c0f460
+ * CTX d183f04404a0d6f469558e81fab24a167905ab4c7878502ad3e38fdbe62a4155
+ * CTX 6cec37325759533ce8f25f367c87bb5578d667ae93f9e2fd99bcbc5f2fbba88c
+ * CTX f6516139420fcff3b7361d86322c4bd84c82f335abb152c4a93411373aaa8220
+ * Plaintext length (bytes): 512
+*/
+
+static uint8_t v13_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v13_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v13_TW[16] = {
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v13_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v13_CTX[512] = {
+ 0xbf, 0x53, 0xd2, 0xda, 0xde, 0x78, 0xe8, 0x22,
+ 0xa4, 0xd9, 0x49, 0xa9, 0xbc, 0x67, 0x66, 0xb0,
+ 0x1b, 0x06, 0xa8, 0xef, 0x70, 0xd2, 0x67, 0x48,
+ 0xc6, 0xa7, 0xfc, 0x36, 0xd8, 0x0a, 0xe4, 0xc5,
+ 0x52, 0x0f, 0x7c, 0x4a, 0xb0, 0xac, 0x85, 0x44,
+ 0x42, 0x4f, 0xa4, 0x05, 0x16, 0x2f, 0xef, 0x5a,
+ 0x6b, 0x7f, 0x22, 0x94, 0x98, 0x06, 0x36, 0x18,
+ 0xd3, 0x9f, 0x00, 0x03, 0xcb, 0x5f, 0xb8, 0xd1,
+ 0xc8, 0x6b, 0x64, 0x34, 0x97, 0xda, 0x1f, 0xf9,
+ 0x45, 0xc8, 0xd3, 0xbe, 0xde, 0xca, 0x4f, 0x47,
+ 0x97, 0x02, 0xa7, 0xa7, 0x35, 0xf0, 0x43, 0xdd,
+ 0xb1, 0xd6, 0xaa, 0xad, 0xe3, 0xc4, 0xa0, 0xac,
+ 0x7c, 0xa7, 0xf3, 0xfa, 0x52, 0x79, 0xbe, 0xf5,
+ 0x6f, 0x82, 0xcd, 0x7a, 0x2f, 0x38, 0x67, 0x2e,
+ 0x82, 0x48, 0x14, 0xe1, 0x07, 0x00, 0x30, 0x0a,
+ 0x05, 0x5e, 0x16, 0x30, 0xb8, 0xf1, 0xcb, 0x0e,
+ 0x91, 0x9f, 0x5e, 0x94, 0x20, 0x10, 0xa4, 0x16,
+ 0xe2, 0xbf, 0x48, 0xcb, 0x46, 0x99, 0x3d, 0x3c,
+ 0xb6, 0xa5, 0x1c, 0x19, 0xba, 0xcf, 0x86, 0x47,
+ 0x85, 0xa0, 0x0b, 0xc2, 0xec, 0xff, 0x15, 0xd3,
+ 0x50, 0x87, 0x5b, 0x24, 0x6e, 0xd5, 0x3e, 0x68,
+ 0xbe, 0x6f, 0x55, 0xbd, 0x7e, 0x05, 0xcf, 0xc2,
+ 0xb2, 0xed, 0x64, 0x32, 0x19, 0x8a, 0x64, 0x44,
+ 0xb6, 0xd8, 0xc2, 0x47, 0xfa, 0xb9, 0x41, 0xf5,
+ 0x69, 0x76, 0x8b, 0x5c, 0x42, 0x93, 0x66, 0xf1,
+ 0xd3, 0xf0, 0x0f, 0x03, 0x45, 0xb9, 0x61, 0x23,
+ 0xd5, 0x62, 0x04, 0xc0, 0x1c, 0x63, 0xb2, 0x2c,
+ 0xe7, 0x8b, 0xaf, 0x11, 0x6e, 0x52, 0x5e, 0xd9,
+ 0x0f, 0xde, 0xa3, 0x9f, 0xa4, 0x69, 0x49, 0x4d,
+ 0x38, 0x66, 0xc3, 0x1e, 0x05, 0xf2, 0x95, 0xff,
+ 0x21, 0xfe, 0xa8, 0xd4, 0xe6, 0xe1, 0x3d, 0x67,
+ 0xe4, 0x7c, 0xe7, 0x22, 0xe9, 0x69, 0x8a, 0x1c,
+ 0x10, 0x48, 0xd6, 0x8e, 0xbc, 0xde, 0x76, 0xb8,
+ 0x6f, 0xcf, 0x97, 0x6e, 0xab, 0x8a, 0xa9, 0x79,
+ 0x02, 0x68, 0xb7, 0x06, 0x8e, 0x01, 0x7a, 0x8b,
+ 0x9b, 0x74, 0x94, 0x09, 0x51, 0x4f, 0x10, 0x53,
+ 0x02, 0x7f, 0xd1, 0x6c, 0x37, 0x86, 0xea, 0x1b,
+ 0xac, 0x5f, 0x15, 0xcb, 0x79, 0x71, 0x1e, 0xe2,
+ 0xab, 0xe8, 0x2f, 0x5c, 0xf8, 0xb1, 0x3a, 0xe7,
+ 0x30, 0x30, 0xef, 0x5b, 0x9e, 0x44, 0x57, 0xe7,
+ 0x5d, 0x13, 0x04, 0xf9, 0x88, 0xd6, 0x2d, 0xd6,
+ 0xfc, 0x4b, 0x94, 0xed, 0x38, 0xba, 0x83, 0x1d,
+ 0xa4, 0xb7, 0x63, 0x49, 0x71, 0xb6, 0xcd, 0x8e,
+ 0xc3, 0x25, 0xd9, 0xc6, 0x1c, 0x00, 0xf1, 0xdf,
+ 0x73, 0x62, 0x7e, 0xd3, 0x74, 0x5a, 0x5e, 0x84,
+ 0x89, 0xf3, 0xa9, 0x5c, 0x69, 0x63, 0x9c, 0x32,
+ 0xcd, 0x6e, 0x1d, 0x53, 0x7a, 0x85, 0xf7, 0x5c,
+ 0xc8, 0x44, 0x72, 0x6e, 0x8a, 0x72, 0xfc, 0x00,
+ 0x77, 0xad, 0x22, 0x00, 0x0f, 0x1d, 0x50, 0x78,
+ 0xf6, 0xb8, 0x66, 0x31, 0x8c, 0x66, 0x8f, 0x1a,
+ 0xd0, 0x3d, 0x5a, 0x5f, 0xce, 0xd5, 0x21, 0x9f,
+ 0x2e, 0xab, 0xbd, 0x0a, 0xa5, 0xc0, 0xf4, 0x60,
+ 0xd1, 0x83, 0xf0, 0x44, 0x04, 0xa0, 0xd6, 0xf4,
+ 0x69, 0x55, 0x8e, 0x81, 0xfa, 0xb2, 0x4a, 0x16,
+ 0x79, 0x05, 0xab, 0x4c, 0x78, 0x78, 0x50, 0x2a,
+ 0xd3, 0xe3, 0x8f, 0xdb, 0xe6, 0x2a, 0x41, 0x55,
+ 0x6c, 0xec, 0x37, 0x32, 0x57, 0x59, 0x53, 0x3c,
+ 0xe8, 0xf2, 0x5f, 0x36, 0x7c, 0x87, 0xbb, 0x55,
+ 0x78, 0xd6, 0x67, 0xae, 0x93, 0xf9, 0xe2, 0xfd,
+ 0x99, 0xbc, 0xbc, 0x5f, 0x2f, 0xbb, 0xa8, 0x8c,
+ 0xf6, 0x51, 0x61, 0x39, 0x42, 0x0f, 0xcf, 0xf3,
+ 0xb7, 0x36, 0x1d, 0x86, 0x32, 0x2c, 0x4b, 0xd8,
+ 0x4c, 0x82, 0xf3, 0x35, 0xab, 0xb1, 0x52, 0xc4,
+ 0xa9, 0x34, 0x11, 0x37, 0x3a, 0xaa, 0x82, 0x20
+};
+
+/*
+ * Vector 14
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffffffffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 64497e5a831e4a932c09be3e5393376daa599548b816031d224bbf50a818ed23
+ * CTX 50eae7e96087c8a0db51ad290bd00c1ac1620857635bf246c176ab463be30b80
+ * CTX 8da548081ac847b158e1264be25bb0910bbc92647108089415d45fab1b3d2604
+ * CTX e8a8eff1ae4020cfa39936b66827b23f371b92200be90251e6d73c5f86de5fd4
+ * CTX a950781933d79a28272b782a2ec313efdfcc0628f43d744c2dc2ff3dcb66999b
+ * CTX 50c7ca895b0c64791eeaa5f29499fb1c026f84ce5b5c72ba1083cddb5ce45434
+ * CTX 631665c333b60b11593fb253c5179a2c8db813782a004856a1653011e93fb6d8
+ * CTX 76c18366dd8683f53412c0c180f9c848592d593f8609ca736317d356e13e2bff
+ * CTX 3a9f59cd9aeb19cd482593d8c46128bb32423b37a9adfb482b99453fbe25a41b
+ * CTX f6feb4aa0bef5ed24bf73c762978025482c13115e4015aac992e5613a3b5c2f6
+ * CTX 85b84795cb6e9b2656d8c88157e52c42f978d8634c43d06fea928f2822e465aa
+ * CTX 6576e9bf419384506cc3ce3c54ac1a6f67dc66f3b30191e698380bc999b05abc
+ * CTX e19dc0c6dcc2dd001ec535ba18deb2df1a101023108318c75dc98611a09dc48a
+ * CTX 0acdec676fabdf222f07e026f059b672b56e5cbc8e1d21bbd867dd9272120546
+ * CTX 81d70ea737134cdfce93b6f82ae22423274e58a0821cc5502e2d0ab4585e94de
+ * CTX 6975be5e0b4efce51cd3e70c25a1fbbbd609d273ad5b0d59631c531f6a0a57b9
+ * Plaintext length (bytes): 512
+*/
+
+static uint8_t v14_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v14_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v14_TW[16] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v14_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v14_CTX[512] = {
+ 0x64, 0x49, 0x7e, 0x5a, 0x83, 0x1e, 0x4a, 0x93,
+ 0x2c, 0x09, 0xbe, 0x3e, 0x53, 0x93, 0x37, 0x6d,
+ 0xaa, 0x59, 0x95, 0x48, 0xb8, 0x16, 0x03, 0x1d,
+ 0x22, 0x4b, 0xbf, 0x50, 0xa8, 0x18, 0xed, 0x23,
+ 0x50, 0xea, 0xe7, 0xe9, 0x60, 0x87, 0xc8, 0xa0,
+ 0xdb, 0x51, 0xad, 0x29, 0x0b, 0xd0, 0x0c, 0x1a,
+ 0xc1, 0x62, 0x08, 0x57, 0x63, 0x5b, 0xf2, 0x46,
+ 0xc1, 0x76, 0xab, 0x46, 0x3b, 0xe3, 0x0b, 0x80,
+ 0x8d, 0xa5, 0x48, 0x08, 0x1a, 0xc8, 0x47, 0xb1,
+ 0x58, 0xe1, 0x26, 0x4b, 0xe2, 0x5b, 0xb0, 0x91,
+ 0x0b, 0xbc, 0x92, 0x64, 0x71, 0x08, 0x08, 0x94,
+ 0x15, 0xd4, 0x5f, 0xab, 0x1b, 0x3d, 0x26, 0x04,
+ 0xe8, 0xa8, 0xef, 0xf1, 0xae, 0x40, 0x20, 0xcf,
+ 0xa3, 0x99, 0x36, 0xb6, 0x68, 0x27, 0xb2, 0x3f,
+ 0x37, 0x1b, 0x92, 0x20, 0x0b, 0xe9, 0x02, 0x51,
+ 0xe6, 0xd7, 0x3c, 0x5f, 0x86, 0xde, 0x5f, 0xd4,
+ 0xa9, 0x50, 0x78, 0x19, 0x33, 0xd7, 0x9a, 0x28,
+ 0x27, 0x2b, 0x78, 0x2a, 0x2e, 0xc3, 0x13, 0xef,
+ 0xdf, 0xcc, 0x06, 0x28, 0xf4, 0x3d, 0x74, 0x4c,
+ 0x2d, 0xc2, 0xff, 0x3d, 0xcb, 0x66, 0x99, 0x9b,
+ 0x50, 0xc7, 0xca, 0x89, 0x5b, 0x0c, 0x64, 0x79,
+ 0x1e, 0xea, 0xa5, 0xf2, 0x94, 0x99, 0xfb, 0x1c,
+ 0x02, 0x6f, 0x84, 0xce, 0x5b, 0x5c, 0x72, 0xba,
+ 0x10, 0x83, 0xcd, 0xdb, 0x5c, 0xe4, 0x54, 0x34,
+ 0x63, 0x16, 0x65, 0xc3, 0x33, 0xb6, 0x0b, 0x11,
+ 0x59, 0x3f, 0xb2, 0x53, 0xc5, 0x17, 0x9a, 0x2c,
+ 0x8d, 0xb8, 0x13, 0x78, 0x2a, 0x00, 0x48, 0x56,
+ 0xa1, 0x65, 0x30, 0x11, 0xe9, 0x3f, 0xb6, 0xd8,
+ 0x76, 0xc1, 0x83, 0x66, 0xdd, 0x86, 0x83, 0xf5,
+ 0x34, 0x12, 0xc0, 0xc1, 0x80, 0xf9, 0xc8, 0x48,
+ 0x59, 0x2d, 0x59, 0x3f, 0x86, 0x09, 0xca, 0x73,
+ 0x63, 0x17, 0xd3, 0x56, 0xe1, 0x3e, 0x2b, 0xff,
+ 0x3a, 0x9f, 0x59, 0xcd, 0x9a, 0xeb, 0x19, 0xcd,
+ 0x48, 0x25, 0x93, 0xd8, 0xc4, 0x61, 0x28, 0xbb,
+ 0x32, 0x42, 0x3b, 0x37, 0xa9, 0xad, 0xfb, 0x48,
+ 0x2b, 0x99, 0x45, 0x3f, 0xbe, 0x25, 0xa4, 0x1b,
+ 0xf6, 0xfe, 0xb4, 0xaa, 0x0b, 0xef, 0x5e, 0xd2,
+ 0x4b, 0xf7, 0x3c, 0x76, 0x29, 0x78, 0x02, 0x54,
+ 0x82, 0xc1, 0x31, 0x15, 0xe4, 0x01, 0x5a, 0xac,
+ 0x99, 0x2e, 0x56, 0x13, 0xa3, 0xb5, 0xc2, 0xf6,
+ 0x85, 0xb8, 0x47, 0x95, 0xcb, 0x6e, 0x9b, 0x26,
+ 0x56, 0xd8, 0xc8, 0x81, 0x57, 0xe5, 0x2c, 0x42,
+ 0xf9, 0x78, 0xd8, 0x63, 0x4c, 0x43, 0xd0, 0x6f,
+ 0xea, 0x92, 0x8f, 0x28, 0x22, 0xe4, 0x65, 0xaa,
+ 0x65, 0x76, 0xe9, 0xbf, 0x41, 0x93, 0x84, 0x50,
+ 0x6c, 0xc3, 0xce, 0x3c, 0x54, 0xac, 0x1a, 0x6f,
+ 0x67, 0xdc, 0x66, 0xf3, 0xb3, 0x01, 0x91, 0xe6,
+ 0x98, 0x38, 0x0b, 0xc9, 0x99, 0xb0, 0x5a, 0xbc,
+ 0xe1, 0x9d, 0xc0, 0xc6, 0xdc, 0xc2, 0xdd, 0x00,
+ 0x1e, 0xc5, 0x35, 0xba, 0x18, 0xde, 0xb2, 0xdf,
+ 0x1a, 0x10, 0x10, 0x23, 0x10, 0x83, 0x18, 0xc7,
+ 0x5d, 0xc9, 0x86, 0x11, 0xa0, 0x9d, 0xc4, 0x8a,
+ 0x0a, 0xcd, 0xec, 0x67, 0x6f, 0xab, 0xdf, 0x22,
+ 0x2f, 0x07, 0xe0, 0x26, 0xf0, 0x59, 0xb6, 0x72,
+ 0xb5, 0x6e, 0x5c, 0xbc, 0x8e, 0x1d, 0x21, 0xbb,
+ 0xd8, 0x67, 0xdd, 0x92, 0x72, 0x12, 0x05, 0x46,
+ 0x81, 0xd7, 0x0e, 0xa7, 0x37, 0x13, 0x4c, 0xdf,
+ 0xce, 0x93, 0xb6, 0xf8, 0x2a, 0xe2, 0x24, 0x23,
+ 0x27, 0x4e, 0x58, 0xa0, 0x82, 0x1c, 0xc5, 0x50,
+ 0x2e, 0x2d, 0x0a, 0xb4, 0x58, 0x5e, 0x94, 0xde,
+ 0x69, 0x75, 0xbe, 0x5e, 0x0b, 0x4e, 0xfc, 0xe5,
+ 0x1c, 0xd3, 0xe7, 0x0c, 0x25, 0xa1, 0xfb, 0xbb,
+ 0xd6, 0x09, 0xd2, 0x73, 0xad, 0x5b, 0x0d, 0x59,
+ 0x63, 0x1c, 0x53, 0x1f, 0x6a, 0x0a, 0x57, 0xb9
+};
+
+//
+// Define vector of structs, with pointers to the statically defined vectors
+
+struct xts_vector vlist[NVEC] = {
+
+ // pointers to the statically defined vectors here
+
+ // Vector 10
+ {sizeof(v10_CTX), v10_key1, v10_key2, v10_TW, v10_PTX, v10_CTX}
+ ,
+ // Vector 11
+ {sizeof(v11_CTX), v11_key1, v11_key2, v11_TW, v11_PTX, v11_CTX}
+ ,
+ // Vector 12
+ {sizeof(v12_CTX), v12_key1, v12_key2, v12_TW, v12_PTX, v12_CTX}
+ ,
+ // Vector 13
+ {sizeof(v13_CTX), v13_key1, v13_key2, v13_TW, v13_PTX, v13_CTX}
+ ,
+ // Vector 14
+ {sizeof(v14_CTX), v14_key1, v14_key2, v14_TW, v14_PTX, v14_CTX}
+
+};
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm
new file mode 100644
index 000000000..416da1e7b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm
@@ -0,0 +1,78 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%include "reg_sizes.asm"
+
+extern XTS_AES_128_enc_sse
+extern XTS_AES_128_enc_avx
+
+extern XTS_AES_128_enc_expanded_key_sse
+extern XTS_AES_128_enc_expanded_key_avx
+
+extern XTS_AES_128_dec_sse
+extern XTS_AES_128_dec_avx
+
+extern XTS_AES_128_dec_expanded_key_sse
+extern XTS_AES_128_dec_expanded_key_avx
+
+%if (AS_FEATURE_LEVEL) >= 10
+extern XTS_AES_128_enc_vaes
+extern XTS_AES_128_enc_expanded_key_vaes
+extern XTS_AES_128_dec_vaes
+extern XTS_AES_128_dec_expanded_key_vaes
+%endif
+
+section .text
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate XTS_AES_128_enc, XTS_AES_128_enc_expanded_key, XTS_AES_128_dec, and XTS_AES_128_dec_expanded_key
+;;;;
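+; Runtime CPU dispatch: each interface below resolves to the SSE, AVX or (at
+; feature level >= 10) VAES implementation declared above, according to the
+; CPU features detected at run time.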
+mbin_interface XTS_AES_128_enc
+mbin_dispatch_init7 XTS_AES_128_enc, XTS_AES_128_enc_sse, XTS_AES_128_enc_sse, XTS_AES_128_enc_avx, XTS_AES_128_enc_avx, XTS_AES_128_enc_avx, XTS_AES_128_enc_vaes
+
+mbin_interface XTS_AES_128_enc_expanded_key
+mbin_dispatch_init7 XTS_AES_128_enc_expanded_key, XTS_AES_128_enc_expanded_key_sse, XTS_AES_128_enc_expanded_key_sse, XTS_AES_128_enc_expanded_key_avx, XTS_AES_128_enc_expanded_key_avx, XTS_AES_128_enc_expanded_key_avx, XTS_AES_128_enc_expanded_key_vaes
+
+mbin_interface XTS_AES_128_dec
+mbin_dispatch_init7 XTS_AES_128_dec, XTS_AES_128_dec_sse, XTS_AES_128_dec_sse, XTS_AES_128_dec_avx, XTS_AES_128_dec_avx, XTS_AES_128_dec_avx, XTS_AES_128_dec_vaes
+
+mbin_interface XTS_AES_128_dec_expanded_key
+mbin_dispatch_init7 XTS_AES_128_dec_expanded_key, XTS_AES_128_dec_expanded_key_sse, XTS_AES_128_dec_expanded_key_sse, XTS_AES_128_dec_expanded_key_avx, XTS_AES_128_dec_expanded_key_avx, XTS_AES_128_dec_expanded_key_avx, XTS_AES_128_dec_expanded_key_vaes
+
+
+;;; func core, ver, snum
+slversion XTS_AES_128_enc, 01, 04, 0071
+slversion XTS_AES_128_enc_expanded_key, 01, 04, 0072
+slversion XTS_AES_128_dec, 01, 04, 0073
+slversion XTS_AES_128_dec_expanded_key, 01, 04, 0074
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm
new file mode 100644
index 000000000..33f376d5c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm
@@ -0,0 +1,78 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%include "reg_sizes.asm"
+
+extern XTS_AES_256_enc_sse
+extern XTS_AES_256_enc_avx
+
+extern XTS_AES_256_enc_expanded_key_sse
+extern XTS_AES_256_enc_expanded_key_avx
+
+extern XTS_AES_256_dec_sse
+extern XTS_AES_256_dec_avx
+
+extern XTS_AES_256_dec_expanded_key_sse
+extern XTS_AES_256_dec_expanded_key_avx
+
+%if (AS_FEATURE_LEVEL) >= 10
+extern XTS_AES_256_enc_vaes
+extern XTS_AES_256_enc_expanded_key_vaes
+extern XTS_AES_256_dec_vaes
+extern XTS_AES_256_dec_expanded_key_vaes
+%endif
+
+section .text
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate XTS_AES_256_enc, XTS_AES_256_enc_expanded_key, XTS_AES_256_dec, and XTS_AES_256_dec_expanded_key
+;;;;
+mbin_interface XTS_AES_256_enc
+mbin_dispatch_init7 XTS_AES_256_enc, XTS_AES_256_enc_sse, XTS_AES_256_enc_sse, XTS_AES_256_enc_avx, XTS_AES_256_enc_avx, XTS_AES_256_enc_avx, XTS_AES_256_enc_vaes
+
+mbin_interface XTS_AES_256_enc_expanded_key
+mbin_dispatch_init7 XTS_AES_256_enc_expanded_key, XTS_AES_256_enc_expanded_key_sse, XTS_AES_256_enc_expanded_key_sse, XTS_AES_256_enc_expanded_key_avx, XTS_AES_256_enc_expanded_key_avx, XTS_AES_256_enc_expanded_key_avx, XTS_AES_256_enc_expanded_key_vaes
+
+mbin_interface XTS_AES_256_dec
+mbin_dispatch_init7 XTS_AES_256_dec, XTS_AES_256_dec_sse, XTS_AES_256_dec_sse, XTS_AES_256_dec_avx, XTS_AES_256_dec_avx, XTS_AES_256_dec_avx, XTS_AES_256_dec_vaes
+
+mbin_interface XTS_AES_256_dec_expanded_key
+mbin_dispatch_init7 XTS_AES_256_dec_expanded_key, XTS_AES_256_dec_expanded_key_sse, XTS_AES_256_dec_expanded_key_sse, XTS_AES_256_dec_expanded_key_avx, XTS_AES_256_dec_expanded_key_avx, XTS_AES_256_dec_expanded_key_avx, XTS_AES_256_dec_expanded_key_vaes
+
+
+;;; func core, ver, snum
+slversion XTS_AES_256_enc, 01, 04, 0076
+slversion XTS_AES_256_enc_expanded_key, 01, 04, 0077
+slversion XTS_AES_256_dec, 01, 04, 0078
+slversion XTS_AES_256_dec_expanded_key, 01, 04, 0079
diff --git a/src/crypto/isa-l/isa-l_crypto/autogen.sh b/src/crypto/isa-l/isa-l_crypto/autogen.sh
new file mode 100755
index 000000000..0a3189383
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/autogen.sh
@@ -0,0 +1,17 @@
+#!/bin/sh -e
+
+autoreconf --install --symlink -f
+
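+# Resolve the multilib OS library directory (e.g. /usr/lib64) under the given
+# prefix, using gcc -print-multi-os-directory.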
+libdir() {
+ echo $(cd $1/$(gcc -print-multi-os-directory); pwd)
+}
+
+args="--prefix=/usr --libdir=$(libdir /usr/lib)"
+
+echo
+echo "----------------------------------------------------------------"
+echo "Initialized build system. For a common configuration please run:"
+echo "----------------------------------------------------------------"
+echo
+echo "./configure $args"
+echo
diff --git a/src/crypto/isa-l/isa-l_crypto/configure.ac b/src/crypto/isa-l/isa-l_crypto/configure.ac
new file mode 100644
index 000000000..70f9cc88d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/configure.ac
@@ -0,0 +1,349 @@
+# -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+
+AC_PREREQ(2.69)
+AC_INIT([libisal_crypto],
+ [2.24.0],
+ [sg.support.isal@intel.com],
+ [isa-l_crypto],
+ [http://01.org/storage-acceleration-library])
+AC_CONFIG_SRCDIR([])
+AC_CONFIG_AUX_DIR([build-aux])
+AM_INIT_AUTOMAKE([
+ foreign
+ 1.11
+ -Wall
+ -Wno-portability
+ silent-rules
+ tar-pax
+ no-dist-gzip
+ dist-xz
+ subdir-objects
+])
+AM_PROG_AS
+
+AC_CANONICAL_HOST
+CPU=""
+AS_CASE([$host_cpu],
+ [x86_64], [CPU="x86_64"],
+ [amd64], [CPU="x86_64"],
+ [i?86], [CPU="x86_32"],
+ [aarch64], [CPU="aarch64"],
+ [arm64], [CPU="aarch64"],
+)
+AM_CONDITIONAL([CPU_X86_64], [test "$CPU" = "x86_64"])
+AM_CONDITIONAL([CPU_X86_32], [test "$CPU" = "x86_32"])
+AM_CONDITIONAL([CPU_AARCH64], [test "$CPU" = "aarch64"])
+AM_CONDITIONAL([CPU_UNDEFINED], [test "x$CPU" = "x"])
+
+if test "$CPU" = "x86_64"; then
+ is_x86=yes
+else
+ if test "$CPU" = "x86_32"; then
+ is_x86=yes
+ else
+ is_x86=no
+ fi
+fi
+
+# Check for programs
+AC_PROG_CC_STDC
+AC_USE_SYSTEM_EXTENSIONS
+AM_SILENT_RULES([yes])
+LT_INIT
+AC_PREFIX_DEFAULT([/usr])
+AC_PROG_SED
+AC_PROG_MKDIR_P
+
+# Options
+AC_ARG_ENABLE([debug],
+ AS_HELP_STRING([--enable-debug], [enable debug messages @<:@default=disabled@:>@]),
+ [], [enable_debug=no])
+AS_IF([test "x$enable_debug" = "xyes"], [
+ AC_DEFINE(ENABLE_DEBUG, [1], [Debug messages.])
+])
+# If this build is for x86, look for yasm and nasm
+if test x"$is_x86" = x"yes"; then
+AC_MSG_CHECKING([whether Intel CET is enabled])
+AC_TRY_COMPILE([],[
+#ifndef __CET__
+# error CET is not enabled
+#endif],
+ [AC_MSG_RESULT([yes])
+ intel_cet_enabled=yes],
+ [AC_MSG_RESULT([no])
+ intel_cet_enabled=no])
+
+
+ # Pick an assembler yasm or nasm
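+ # Feature levels used below: 1 = assembler found, 4 = modern assembler,
+ # 6 = AVX512 support, 10 = extended AVX512 (the level the .asm files test
+ # via AS_FEATURE_LEVEL >= 10 before enabling the VAES paths).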
+ if test x"$AS" = x""; then
+ # Check for yasm and yasm features
+ yasm_feature_level=0
+ AC_CHECK_PROG(HAVE_YASM, yasm, yes, no)
+ if test "$HAVE_YASM" = "yes"; then
+ yasm_feature_level=1
+ else
+ AC_MSG_RESULT([no yasm])
+ fi
+ if test x"$yasm_feature_level" = x"1"; then
+ AC_MSG_CHECKING([for modern yasm])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vmovdqa %xmm0, %xmm1;]])])
+ if yasm -f elf64 -p gas conftest.c ; then
+ AC_MSG_RESULT([yes])
+ yasm_feature_level=4
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+ if test x"$yasm_feature_level" = x"4"; then
+ AC_MSG_CHECKING([for optional yasm AVX512 support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpshufb %zmm0, %zmm1, %zmm2;]])])
+ if yasm -f elf64 -p gas conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ yasm_feature_level=6
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+ if test x"$yasm_feature_level" = x"6"; then
+ AC_MSG_CHECKING([for additional yasm AVX512 support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpcompressb zmm0, k1, zmm1;]])])
+ sed -i -e '/vpcompressb/!d' conftest.c
+ if yasm -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ yasm_feature_level=10
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+
+ AC_MSG_CHECKING([for optional yasm SHA-NI support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[sha256rnds2 %xmm0,%xmm1,%xmm2;]])])
+ if yasm -f elf64 -p gas conftest.c 2> /dev/null; then
+ yasm_knows_shani=yes
+ AC_MSG_RESULT([yes])
+ else
+ AC_MSG_RESULT([no])
+ fi
+
+ # Check for nasm and nasm features
+ nasm_feature_level=0
+ AC_CHECK_PROG(HAVE_NASM, nasm, yes, no)
+ if test "$HAVE_NASM" = "yes"; then
+ nasm_feature_level=1
+ else
+ AC_MSG_RESULT([no nasm])
+ fi
+
+ if test x"$nasm_feature_level" = x"1"; then
+ AC_MSG_CHECKING([for modern nasm])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[pblendvb xmm2, xmm1;]])])
+ sed -i -e '/pblendvb/!d' conftest.c
+ if nasm -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ nasm_feature_level=4
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+ if test x"$nasm_feature_level" = x"4"; then
+ AC_MSG_CHECKING([for optional nasm AVX512 support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vinserti32x8 zmm0, ymm1, 1;]])])
+ sed -i -e '/vinsert/!d' conftest.c
+ if nasm -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ nasm_feature_level=6
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+ if test x"$nasm_feature_level" = x"6"; then
+ AC_MSG_CHECKING([for additional nasm AVX512 support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpcompressb zmm0 {k1}, zmm1;]])])
+ sed -i -e '/vpcompressb/!d' conftest.c
+ if nasm -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ nasm_feature_level=10
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+
+ AC_MSG_CHECKING([for optional nasm SHA-NI support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[sha256rnds2 xmm2,xmm1,xmm0;]])])
+ sed -i -e '/sha256rnds2/!d' conftest.c
+ if nasm -f elf64 conftest.c 2> /dev/null; then
+ nasm_knows_shani=yes
+ AC_MSG_RESULT([yes])
+ else
+ AC_MSG_RESULT([no])
+ fi
+
+ if test $nasm_feature_level -ge $yasm_feature_level ; then
+ AS=nasm
+ as_feature_level=$nasm_feature_level
+ as_knows_shani=$nasm_knows_shani
+ else
+ AS=yasm
+ as_feature_level=$yasm_feature_level
+ as_knows_shani=$yasm_knows_shani
+ fi
+
+ else
+ # Check for $AS supported features
+ as_feature_level=0
+ AC_CHECK_PROG(HAVE_AS, $AS, yes, no)
+ if test "$HAVE_AS" = "yes"; then
+ as_feature_level=1
+ else
+ AC_MSG_ERROR([no $AS])
+ fi
+
+ if test x"$as_feature_level" = x"1"; then
+ AC_MSG_CHECKING([for modern $AS])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[pblendvb xmm2, xmm1;]])])
+ sed -i -e '/pblendvb/!d' conftest.c
+ if $AS -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ as_feature_level=4
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+ if test x"$as_feature_level" = x"4"; then
+ AC_MSG_CHECKING([for optional as AVX512 support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vinserti32x8 zmm0, ymm1, 1;]])])
+ sed -i -e '/vinsert/!d' conftest.c
+ if $AS -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ as_feature_level=6
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+ if test x"$as_feature_level" = x"6"; then
+ AC_MSG_CHECKING([for additional as AVX512 support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpcompressb zmm0, k1, zmm1;]])])
+ sed -i -e '/vpcompressb/!d' conftest.c
+ if $AS -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ as_feature_level=10
+ else
+ AC_MSG_RESULT([no])
+ fi
+ fi
+
+ AC_MSG_CHECKING([for optional as SHA-NI support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[sha256rnds2 xmm2,xmm1,xmm0;]])])
+ sed -i -e '/sha256rnds2/!d' conftest.c
+ if $AS -f elf64 conftest.c 2> /dev/null; then
+ AC_MSG_RESULT([yes])
+ as_knows_shani=yes
+ else
+ AC_MSG_RESULT([no])
+ fi
+
+ fi
+
+ if test $as_feature_level -lt 2 ; then
+ AC_MSG_ERROR([No modern nasm or yasm found as required. Nasm should be v2.11.01 or later (v2.13 for AVX512) and yasm should be 1.2.0 or later.])
+ fi
+
+ if test x"$as_knows_shani" = x"yes"; then
+ AC_DEFINE(HAVE_AS_KNOWS_SHANI, [1], [Assembler can do SHANI.])
+ have_as_knows_shani=yes
+ else
+ AC_MSG_RESULT([Assembler does not understand SHANI opcodes. Consider upgrading for best performance.])
+ fi
+
+ case $host_os in
+ *linux*) arch=linux yasm_args="-f elf64";;
+ *darwin*) arch=darwin yasm_args="-f macho64 --prefix=_ ";;
+ *netbsd*) arch=netbsd yasm_args="-f elf64";;
+ *mingw*) arch=mingw yasm_args="-f win64";;
+ *) arch=unknown yasm_args="-f elf64";;
+ esac
+
+ # Fix for nasm missing windows features
+ if test x"$arch" = x"mingw"; then
+ AS=yasm
+ as_feature_level=$yasm_feature_level
+ if test $as_feature_level -lt 2 ; then
+ AC_MSG_ERROR([Mingw build requires Yasm 1.2.0 or later.])
+ fi
+ fi
+
+ AC_DEFINE_UNQUOTED(AS_FEATURE_LEVEL, [$as_feature_level], [Assembler feature level.])
+ if test $as_feature_level -ge 6 ; then
+ AC_DEFINE(HAVE_AS_KNOWS_AVX512, [1], [Assembler can do AVX512.])
+ have_as_knows_avx512=yes
+ else
+ AC_MSG_RESULT([Assembler does not understand AVX512 opcodes. Consider upgrading for best performance.])
+ fi
+
+ AM_CONDITIONAL(USE_YASM, test x"$AS" = x"yasm")
+ AM_CONDITIONAL(USE_NASM, test x"$AS" = x"nasm")
+ AM_CONDITIONAL(WITH_AVX512, test x"$have_as_knows_avx512" = x"yes")
+ AM_CONDITIONAL(WITH_SHANI, test x"$have_as_knows_shani" = x"yes")
+ AC_SUBST([yasm_args])
+ AM_CONDITIONAL(DARWIN, test x"$arch" = x"darwin")
+ AC_MSG_RESULT([Using $AS args target "$arch" "$yasm_args"])
+else
+ # Disable below conditionals if not x86
+ AM_CONDITIONAL(USE_YASM, test "x" = "y")
+ AM_CONDITIONAL(USE_NASM, test "x" = "y")
+ AM_CONDITIONAL(WITH_AVX512, test "x" = "y")
+ AM_CONDITIONAL(WITH_SHANI, test "x" = "y")
+ AM_CONDITIONAL(DARWIN, test "x" = "y")
+fi
+
+AM_CONDITIONAL(INTEL_CET_ENABLED, [test x"$intel_cet_enabled" = x"yes"])
+
+# Check for header files
+AC_CHECK_HEADERS([limits.h stdint.h stdlib.h string.h])
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_C_INLINE
+AC_TYPE_SIZE_T
+AC_TYPE_UINT16_T
+AC_TYPE_UINT32_T
+AC_TYPE_UINT64_T
+AC_TYPE_UINT8_T
+
+# Checks for library functions.
+AC_FUNC_MALLOC # Used only in tests
+AC_CHECK_FUNCS([memmove memset])
+
+my_CFLAGS="\
+-Wall \
+-Wchar-subscripts \
+-Wformat-security \
+-Wnested-externs \
+-Wpointer-arith \
+-Wshadow \
+-Wstrict-prototypes \
+-Wtype-limits \
+"
+AC_SUBST([my_CFLAGS])
+
+AC_CONFIG_FILES([\
+ Makefile\
+ libisal_crypto.pc
+])
+
+AC_OUTPUT
+AC_MSG_RESULT([
+ $PACKAGE $VERSION
+ =====
+
+ prefix: ${prefix}
+ sysconfdir: ${sysconfdir}
+ libdir: ${libdir}
+ includedir: ${includedir}
+
+ compiler: ${CC}
+ cflags: ${CFLAGS}
+ ldflags: ${LDFLAGS}
+
+ debug: ${enable_debug}
+])
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/Makefile b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/Makefile
new file mode 100644
index 000000000..41e9e29f2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/Makefile
@@ -0,0 +1,27 @@
+
+INCLUDE = /usr/include
+CFLAGS = -O2 -I$(INCLUDE)
+LDLIBS = -lisal_crypto -lcrypto -lpthread
+test = isal_multithread_perf
+
+source += isal_multithread_perf.c
+source += md5_thread.c \
+ sha1_thread.c \
+ sha256_thread.c \
+ sha512_thread.c \
+ aes_thread.c
+
+ODIR = bin
+objects = $(addprefix $(ODIR)/, $(patsubst %.c, %.o, $(source)))
+
+$(test): $(objects)
+ gcc $? $(LDLIBS) -o $@
+
+$(ODIR): ; mkdir -p $(ODIR)
+$(objects): | $(ODIR)
+$(ODIR)/%.o: %.c
+ gcc -c $(CFLAGS) $< -o $@
+
+clean:
+ @echo Cleaning up
+ @rm -fr $(ODIR) $(test)
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/README.txt b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/README.txt
new file mode 100644
index 000000000..60335f76c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/README.txt
@@ -0,0 +1,25 @@
+/*
+ * Saturation Test
+ * Written by Xiaodong Liu <xiaodong.liu@intel.com>
+ */
+
+This tool measures the saturation performance of ISA-L's multi-buffer hash and other algorithms.
+It can be used to compare multi-buffer hash against OpenSSL's single-buffer hash.
+
+Compilation:
+(Make sure the isa-l_crypto library is already installed. The other required libs are openssl and pthread.)
+make
+
+Usage: ./isal_multithread_perf -n num_threads
+ -v verbose output
+ -t time to run(secs)
+ -n number of algorithm threads
+ -l len of each buffer(KB)
+ -a memory copy before algorithm -- 1 do(default); 0 not do
+ -b memory copy after algorithm -- 1 do(default); 0 not do
+ -m method of algorithm: md5 md5_mb sha1 sha1_mb sha256 sha256_mb
+ sha512 sha512_mb cbc_128_dec cbc_192_dec cbc_256_dec xts_128_enc
+ xts_256_enc gcm_128_enc gcm_256_enc
+
+Example:
+./isal_multithread_perf -m md5 -n 10
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/aes_thread.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/aes_thread.c
new file mode 100644
index 000000000..366fc9bcf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/aes_thread.c
@@ -0,0 +1,380 @@
+
+#include <pthread.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "isal_multithread_perf.h"
+
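+/*
+ * Per-algorithm context: key size in bits plus callbacks for one-time setup,
+ * the per-round cipher operation, and teardown.
+ */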
+struct aes_context {
+ int const bits;
+ int (*const preproc)(struct aes_context * pCtx);
+ void (*const processor)(struct aes_context * pCtx, char *plaintext,
+ char *ciphertext, uint64_t len);
+ void (*const postproc)(struct aes_context * pCtx);
+};
+
+#define rounds_buf 2 /* first one is plain text, second is cipher text */
+
+static uint64_t aes_thread_func(int32_t id, struct aes_context *pCtx)
+{
+ uint32_t i = 0, j = 0;
+ char *aes_buf[rounds_buf] = { NULL }; /* aes_buf holds the plaintext/ciphertext for the cipher operation */
+ char *carry_buf[rounds_buf] = { NULL }; /* carry_buf is used for the optional memory-copy steps */
+ uint64_t round = -1;
+ struct timeval start_tv, stop_tv;
+ long long secs = run_secs;
+
+ printfv("Thread %i is started\n", id);
+ /* memory allocate */
+ for (j = 0; j < rounds_buf; j++) {
+ carry_buf[j] = (char *)calloc((size_t)buflen, 1);
+ if (carry_buf[j] == NULL) {
+ printf("calloc failed test aborted\n");
+ goto out;
+ }
+
+ aes_buf[j] = (char *)calloc((size_t)buflen, 1);
+ if (aes_buf[j] == NULL) {
+ printf("calloc failed test aborted\n");
+ goto out;
+ }
+
+ /* Create the random data */
+ for (i = 0; i < buflen; i += 1024) {
+ carry_buf[j][i] = i % 256;
+ aes_buf[j][i] = i % 256;
+ }
+ }
+
+ if (pCtx->preproc(pCtx)) {
+ printf("preproc failed test aborted\n");
+ goto out;
+ }
+
+ /* Thread sync */
+ pthread_mutex_lock(&count_lock);
+ count++;
+ if (count == num_threads) {
+ pthread_cond_broadcast(&count_cond);
+ } else {
+ pthread_cond_wait(&count_cond, &count_lock);
+ }
+ pthread_mutex_unlock(&count_lock);
+
+ printfv("Thread %i is ready\n", id);
+ /* hash func starts to run */
+ round = 0;
+ gettimeofday(&start_tv, 0);
+ gettimeofday(&stop_tv, 0);
+ while (secs > (stop_tv.tv_sec - start_tv.tv_sec)) {
+ /* Pre mem-operation */
+ if (prememcpy)
+ memcpy(aes_buf[0], carry_buf[0], buflen);
+
+ /* Run the cipher operation (encrypt or decrypt, per the context) */
+ pCtx->processor(pCtx, aes_buf[0], aes_buf[1], buflen);
+
+ /* Post mem-operation */
+ if (postmemcpy)
+ memcpy(carry_buf[1], aes_buf[1], buflen);
+
+ round++;
+
+ gettimeofday(&stop_tv, 0);
+ }
+ printfv("thread %2i, aes_func rounds %ld\n", id, round);
+
+ out:
+ pCtx->postproc(pCtx);
+
+ for (j = 0; j < rounds_buf; j++) {
+ free(carry_buf[j]);
+ free(aes_buf[j]);
+ }
+
+ return round;
+}
+
+/*
+ * facilities for AES-CBC
+ */
+static unsigned char const ic[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
+ 0x0e, 0x0f
+};
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ unsigned int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+/* thread functions for cbc dec */
+struct cbc_context {
+ struct aes_context base;
+ uint8_t *iv;
+ uint8_t key[CBC_256_BITS];
+ struct cbc_key_data *key_data;
+};
+
+static int cbc_dec_pre(struct aes_context *p)
+{
+ struct cbc_context *pCtx = (struct cbc_context *)p;
+ int ret;
+
+ ret = posix_memalign((void **)&pCtx->iv, 16, (CBC_IV_DATA_LEN));
+ ret |= posix_memalign((void **)&pCtx->key_data, 16, (sizeof(*pCtx->key_data)));
+
+ if ((0 != ret) || (NULL == pCtx->iv) || (NULL == pCtx->key_data))
+ return 1;
+
+ mk_rand_data(pCtx->key, sizeof(pCtx->key));
+ memcpy(pCtx->iv, ic, CBC_IV_DATA_LEN);
+ aes_cbc_precomp(pCtx->key, pCtx->base.bits, pCtx->key_data);
+
+ return 0;
+}
+
+static void cbc_dec_post(struct aes_context *p)
+{
+ struct cbc_context *pCtx = (struct cbc_context *)p;
+
+ free(pCtx->iv);
+ free(pCtx->key_data);
+
+ return;
+}
+
+static void cbc_dec_proc(struct aes_context *p, char *plaintext, char *ciphertext,
+ uint64_t len)
+{
+ struct cbc_context *pCtx = (struct cbc_context *)p;
+
+ if (pCtx->base.bits == 128)
+ aes_cbc_dec_128(ciphertext, pCtx->iv, pCtx->key_data->dec_keys, plaintext,
+ len);
+ else if (pCtx->base.bits == 192)
+ aes_cbc_dec_192(ciphertext, pCtx->iv, pCtx->key_data->dec_keys, plaintext,
+ len);
+ else if (pCtx->base.bits == 256)
+ aes_cbc_dec_256(ciphertext, pCtx->iv, pCtx->key_data->dec_keys, plaintext,
+ len);
+ else {
+ printf("unsupported cbc encryption bits %d\n", pCtx->base.bits);
+ exit(1);
+ }
+
+ return;
+}
+
+void *cbc_128_dec_func(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint64_t round = -1;
+
+ struct cbc_context ctx =
+ { {128, cbc_dec_pre, cbc_dec_proc, cbc_dec_post}, NULL, {0}, NULL };
+
+ round = aes_thread_func(id, &ctx.base);
+
+ pthread_exit((void *)round);
+}
+
+void *cbc_192_dec_func(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint64_t round = -1;
+
+ struct cbc_context ctx =
+ { {192, cbc_dec_pre, cbc_dec_proc, cbc_dec_post}, NULL, {0}, NULL };
+
+ round = aes_thread_func(id, &ctx.base);
+
+ pthread_exit((void *)round);
+}
+
+void *cbc_256_dec_func(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint64_t round = -1;
+
+ struct cbc_context ctx =
+ { {256, cbc_dec_pre, cbc_dec_proc, cbc_dec_post}, NULL, {0}, NULL };
+
+ round = aes_thread_func(id, &ctx.base);
+
+ pthread_exit((void *)round);
+}
+
+/*
+ * thread functions for xts enc
+ */
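+/* key1/key2 are sized for the largest (256-bit) key, so one context serves both key widths */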
+struct xts_content {
+ struct aes_context base;
+ unsigned char key1[16 * 2];
+ unsigned char key2[16 * 2];
+ unsigned char tinit[16];
+};
+
+static int xts_enc_pre(struct aes_context *p)
+{
+ struct xts_content *pCtx = (struct xts_content *)p;
+
+ mk_rand_data(pCtx->key1, pCtx->base.bits / 8);
+ mk_rand_data(pCtx->key2, pCtx->base.bits / 8);
+ mk_rand_data(pCtx->tinit, sizeof(pCtx->tinit));
+
+ return 0;
+}
+
+static void xts_enc_post(struct aes_context *p)
+{
+ return;
+}
+
+static void xts_enc_proc(struct aes_context *p, char *plaintext, char *ciphertext,
+ uint64_t len)
+{
+ struct xts_content *pCtx = (struct xts_content *)p;
+
+ if (pCtx->base.bits == 128)
+ XTS_AES_128_enc(pCtx->key2, pCtx->key1, pCtx->tinit, len, plaintext,
+ ciphertext);
+ else if (pCtx->base.bits == 256)
+ XTS_AES_256_enc(pCtx->key2, pCtx->key1, pCtx->tinit, len, plaintext,
+ ciphertext);
+ else {
+ printf("unsupported xts encryption bits %d\n", pCtx->base.bits);
+ exit(1);
+ }
+
+ return;
+}
+
+void *xts_128_enc_func(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint64_t round = -1;
+
+ struct xts_content ctx =
+ { {128, xts_enc_pre, xts_enc_proc, xts_enc_post}, {0}, {0}, {0} };
+
+ round = aes_thread_func(id, &ctx.base);
+
+ pthread_exit((void *)round);
+}
+
+void *xts_256_enc_func(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint64_t round = -1;
+
+ struct xts_content ctx =
+ { {256, xts_enc_pre, xts_enc_proc, xts_enc_post}, {0}, {0}, {0} };
+
+ round = aes_thread_func(id, &ctx.base);
+
+ pthread_exit((void *)round);
+}
+
+/*
+ * thread functions for gcm enc
+ */
+struct gcm_context {
+ struct aes_context base;
+ uint8_t *key;
+ unsigned char *iv;
+ unsigned char *aad;
+ unsigned char *gcm_tag;
+ struct gcm_key_data gkey;
+ struct gcm_context_data gctx;
+};
+
+static int gcm_enc_pre(struct aes_context *p)
+{
+ uint8_t const IVend[] = GCM_IV_END_MARK;
+
+ struct gcm_context *pCtx = (struct gcm_context *)p;
+
+ pCtx->key = malloc(pCtx->base.bits / 8);
+ pCtx->iv = malloc(GCM_IV_LEN);
+ pCtx->gcm_tag = malloc(MAX_TAG_LEN);
+ pCtx->aad = malloc(AAD_LENGTH);
+
+ mk_rand_data(pCtx->aad, AAD_LENGTH);
+
+ mk_rand_data(pCtx->iv, GCM_IV_LEN);
+ memcpy(&pCtx->iv[GCM_IV_END_START], IVend, sizeof(IVend));
+
+ mk_rand_data(pCtx->key, pCtx->base.bits / 8);
+ if (pCtx->base.bits == 128)
+ aes_gcm_pre_128(pCtx->key, &pCtx->gkey);
+ else
+ aes_gcm_pre_256(pCtx->key, &pCtx->gkey);
+
+ return 0;
+}
+
+static void gcm_enc_post(struct aes_context *p)
+{
+ struct gcm_context *pCtx = (struct gcm_context *)p;
+
+ free(pCtx->key);
+ free(pCtx->iv);
+ free(pCtx->gcm_tag);
+ free(pCtx->aad);
+
+ return;
+}
+
+static void gcm_enc_proc(struct aes_context *p, char *plaintext, char *ciphertext,
+ uint64_t len)
+{
+ struct gcm_context *pCtx = (struct gcm_context *)p;
+
+ if (pCtx->base.bits == 128)
+ aes_gcm_enc_128(&pCtx->gkey, &pCtx->gctx, ciphertext, plaintext, len, pCtx->iv,
+ pCtx->aad, AAD_LENGTH, pCtx->gcm_tag, MAX_TAG_LEN);
+ else if (pCtx->base.bits == 256)
+ aes_gcm_enc_256(&pCtx->gkey, &pCtx->gctx, ciphertext, plaintext, len, pCtx->iv,
+ pCtx->aad, AAD_LENGTH, pCtx->gcm_tag, MAX_TAG_LEN);
+ else {
+ printf("unsupported gcm encryption bits %d\n", pCtx->base.bits);
+ exit(1);
+ }
+
+ return;
+}
+
+void *gcm_128_enc_func(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint64_t round = -1;
+
+ struct gcm_context ctx =
+ { {128, gcm_enc_pre, gcm_enc_proc, gcm_enc_post}, NULL, NULL, NULL, NULL, {0} };
+
+ round = aes_thread_func(id, &ctx.base);
+
+ pthread_exit((void *)round);
+}
+
+void *gcm_256_enc_func(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint64_t round = -1;
+
+ struct gcm_context ctx =
+ { {256, gcm_enc_pre, gcm_enc_proc, gcm_enc_post}, NULL, NULL, NULL, NULL, {0} };
+
+ round = aes_thread_func(id, &ctx.base);
+
+ pthread_exit((void *)round);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.c
new file mode 100644
index 000000000..1263fea29
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.c
@@ -0,0 +1,206 @@
+/**
+ * @file isal_multithread_perf.c
+ * @brief Verify saturation behaviour of high-speed algorithms when run across many threads
+ * @details
+ * usage: taskset -c <cpu_index1,cpu_index2,...> isal_multithread_perf -m <algorithm name> -n <thread num>
+ * eg: taskset -c 0-9,20-29 ./isal_multithread_perf -m md5_mb -n 10
+ */
+
+#include <pthread.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdarg.h>
+
+#include "isal_multithread_perf.h"
+
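+/* Table of selectable methods: name, thread entry point, and buffers processed per round */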
+alg_method algs[] = {
+ {"md5", md5_ossl_func, MD5_MAX_LANES}
+ ,
+ {"md5_mb", md5_mb_func, MD5_MAX_LANES}
+ ,
+ {"sha1", sha1_ossl_func, SHA1_MAX_LANES}
+ ,
+ {"sha1_mb", sha1_mb_func, SHA1_MAX_LANES}
+ ,
+ {"sha256", sha256_ossl_func, SHA256_MAX_LANES}
+ ,
+ {"sha256_mb", sha256_mb_func, SHA256_MAX_LANES}
+ ,
+ {"sha512", sha512_ossl_func, SHA512_MAX_LANES}
+ ,
+ {"sha512_mb", sha512_mb_func, SHA512_MAX_LANES}
+ ,
+ {"cbc_128_dec", cbc_128_dec_func, 1}
+ ,
+ {"cbc_192_dec", cbc_192_dec_func, 1}
+ ,
+ {"cbc_256_dec", cbc_256_dec_func, 1}
+ ,
+ {"xts_128_enc", xts_128_enc_func, 1}
+ ,
+ {"xts_256_enc", xts_256_enc_func, 1}
+ ,
+ {"gcm_128_enc", gcm_128_enc_func, 1}
+ ,
+ {"gcm_256_enc", gcm_256_enc_func, 1}
+ ,
+
+ {NULL, NULL}
+};
+
+/* Global parameters*/
+long long run_secs = 10;
+uint32_t num_threads = 2;
+uint32_t buflen = 32 * 1024;
+uint32_t prememcpy = 0;
+uint32_t postmemcpy = 0;
+char *method = "md5_mb";
+
+/* Global thread sync */
+pthread_mutex_t count_lock = PTHREAD_MUTEX_INITIALIZER;
+pthread_cond_t count_cond = PTHREAD_COND_INITIALIZER;
+volatile uint32_t count = 0;
+
+int verbose = 0;
+
+void usage(char *appname)
+{
+ int i = 0;
+ printf("Usage: %s -n num_threads\n", appname);
+ printf("\t-v verbose output\n"
+ "\t-t time to run(secs)\n"
+ "\t-n number of algorithm threads\n"
+ "\t-l len of each buffer(KB)\n"
+ "\t-a memory copy before algorithm -- 1 do(default); 0 not do\n"
+ "\t-b memory copy after algorithm -- 1 do(default); 0 not do\n"
+ "\t-m method of algorithm:");
+ for (i = 0; algs[i].name != NULL; i++)
+ printf(" %s", algs[i].name);
+ printf("\n");
+
+}
+
+void notice(char *appname, alg_method * alg_choose_p)
+{
+ int i = 0;
+ printf("%s starts to run\n", appname);
+ printf("\tverbose output is %d\n"
+ "\truntime is %lld(secs)\n"
+ "\tnumber of algorithm threads is %d\n"
+ "\tlen of each buffer(KB) is %d\n"
+ "\tmemory copy before algorithm is %d\n"
+ "\tmemory copy after algorithm is %d\n"
+ "\tmethod of algorithm is %s\n", verbose, run_secs, num_threads, buflen / 1024,
+ prememcpy, postmemcpy, alg_choose_p->name);
+}
+
+int main(int argc, char **argv)
+{
+ int i = 0;
+ int opt;
+ char *optstring = "t:n:m:l:a:b:v";
+ int32_t *id = NULL, ret = 0;
+ alg_method alg_choose;
+ pthread_t *clients = NULL;
+ uint64_t count = 0, sum = 0;
+ uint32_t rounds_buf;
+
+ while ((opt = getopt(argc, argv, optstring)) != -1) {
+ switch (opt) {
+ case 't':
+ run_secs = atol(optarg);
+ if (run_secs <= 0) {
+ usage(argv[0]);
+ exit(-1);
+ }
+ break;
+ case 'n':
+ num_threads = atoi(optarg);
+ if (num_threads <= 0) {
+ usage(argv[0]);
+ exit(-1);
+ }
+ break;
+ case 'm':
+ method = optarg;
+ break;
+ case 'l':
+ buflen = atoi(optarg) * 1024;
+ if (buflen <= 0) {
+ usage(argv[0]);
+ exit(-1);
+ }
+ break;
+ case 'a':
+ prememcpy = atoi(optarg);
+ if (prememcpy != 0 && prememcpy != 1) {
+ usage(argv[0]);
+ exit(-1);
+ }
+ break;
+ case 'b':
+ postmemcpy = atoi(optarg);
+ if (postmemcpy != 0 && postmemcpy != 1) {
+ usage(argv[0]);
+ exit(-1);
+ }
+ break;
+ case 'v':
+ verbose = 1;
+ break;
+ default:
+ usage(argv[0]);
+ exit(0);
+ }
+ }
+
+ /* Check method str and set algorithm_func */
+ for (i = 0; algs[i].name != NULL; i++) {
+ if (!strcmp(method, algs[i].name)) {
+ alg_choose = algs[i];
+ break;
+ }
+ }
+ if (algs[i].name == NULL) {
+ usage(argv[0]);
+ exit(-1);
+ }
+
+ notice(argv[0], &alg_choose);
+ rounds_buf = alg_choose.rounds_nbuf;
+
+ clients = (pthread_t *) calloc(num_threads + 1, sizeof(pthread_t));
+ id = (int32_t *) calloc(num_threads + 1, sizeof(int32_t));
+
+ printf("Start %i threads, use %s function\n", num_threads, alg_choose.name);
+
+ for (i = 0; i < num_threads; i++) {
+ id[i] = i;
+
+ ret =
+ pthread_create(&clients[i], NULL, alg_choose.thread_func, (void *)&id[i]);
+
+ if (ret != 0) {
+ printf("Failed to create thread %i: %s", i, strerror(ret));
+ exit(-1);
+ }
+ printfv("Thread %i is created\n", i);
+ }
+
+ for (i = 0; i < num_threads; i++) {
+ pthread_join(clients[i], (void *)&count);
+ sum += count;
+ }
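+ /* One round processes rounds_buf buffers of buflen bytes, so loop_unit
+  * converts (rounds / run_secs) into MB/s. */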
+ double loop_unit = ((double)buflen) * rounds_buf / run_secs / 1024 / 1024;
+ printf("Sum of rounds is %ld\n"
+ "Average throughput(MB/s) is %.2f\n"
+ "Total throughput(MB/s) is %.2f\n",
+ sum, (double)sum / i * loop_unit, (double)sum * loop_unit);
+
+ exit(0);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.h b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.h
new file mode 100644
index 000000000..4f38705dd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/isal_multithread_perf.h
@@ -0,0 +1,52 @@
+
+#ifndef ISAL_MULTITHREAD_PERF_H_
+#define ISAL_MULTITHREAD_PERF_H_
+
+#include "isa-l_crypto.h"
+
+/* multibuffer hash */
+void *md5_ossl_func(void *arg);
+void *md5_mb_func(void *arg);
+void *sha1_ossl_func(void *arg);
+void *sha1_mb_func(void *arg);
+void *sha256_ossl_func(void *arg);
+void *sha256_mb_func(void *arg);
+void *sha512_ossl_func(void *arg);
+void *sha512_mb_func(void *arg);
+
+/* aes */
+void *cbc_128_dec_func(void *arg);
+void *cbc_192_dec_func(void *arg);
+void *cbc_256_dec_func(void *arg);
+void *xts_128_enc_func(void *arg);
+void *xts_256_enc_func(void *arg);
+#define AAD_LENGTH 16
+void *gcm_128_enc_func(void *arg);
+void *gcm_256_enc_func(void *arg);
+
+
+typedef struct {
+ char *name;
+ void *(*thread_func) (void *arg);
+ uint32_t rounds_nbuf; /* bufs number of one processing round */
+} alg_method;
+
+
+/* Global parameters*/
+extern long long run_secs;
+extern uint32_t num_threads;
+extern uint32_t buflen;
+extern uint32_t prememcpy;
+extern uint32_t postmemcpy;
+
+extern pthread_mutex_t count_lock;
+extern pthread_cond_t count_cond;
+extern volatile uint32_t count;
+
+extern int verbose;
+#define printfv(format, args...) { \
+ if (verbose) \
+ printf (format, ##args); \
+}
+
+#endif /* ISAL_MULTITHREAD_PERF_H_ */
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/md5_thread.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/md5_thread.c
new file mode 100644
index 000000000..f63b3785b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/md5_thread.c
@@ -0,0 +1,213 @@
+
+#include <pthread.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <openssl/md5.h>
+#include <openssl/sha.h>
+
+#include "isal_multithread_perf.h"
+
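+/*
+ * Defaults to MD5. sha1/sha256/sha512_thread.c define HASH_THREAD and their own
+ * macro set, then include this file again to instantiate the same thread functions.
+ */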
+#ifndef HASH_THREAD
+/* MD5 related params and structures*/
+#define DIGEST_NWORDS MD5_DIGEST_NWORDS
+#define MB_BUFS MD5_MAX_LANES
+#define HASH_CTX_MGR MD5_HASH_CTX_MGR
+#define HASH_CTX MD5_HASH_CTX
+
+#define OSSL_THREAD_FUNC md5_ossl_func
+#define OSSL_HASH_FUNC MD5
+#define MB_THREAD_FUNC md5_mb_func
+#define CTX_MGR_INIT md5_ctx_mgr_init
+#define CTX_MGR_SUBMIT md5_ctx_mgr_submit
+#define CTX_MGR_FLUSH md5_ctx_mgr_flush
+
+#define rounds_buf MD5_MAX_LANES
+
+#endif // HASH_THREAD
+
+typedef uint32_t hash_digests[DIGEST_NWORDS];
+
+void *OSSL_THREAD_FUNC(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint32_t i = 0, j = 0;
+ char *hash_buf[rounds_buf] = { NULL }; /* hash buf is used to do hash compute */
+ char *carry_buf[rounds_buf] = { NULL }; /* carry buf is used to do memory movement */
+ hash_digests digest;
+ uint64_t round = -1;
+ struct timeval start_tv, stop_tv;
+ long long secs = run_secs;
+
+ printfv("Thread %i is started\n", id);
+ /* memory allocate */
+ for (j = 0; j < rounds_buf; j++) {
+ carry_buf[j] = (char *)calloc((size_t)buflen, 1);
+ if (carry_buf[j] == NULL) {
+ printf("calloc failed test aborted\n");
+ goto out;
+ }
+
+ hash_buf[j] = (char *)calloc((size_t)buflen, 1);
+ if (hash_buf[j] == NULL) {
+ printf("calloc failed test aborted\n");
+ goto out;
+ }
+
+ /* Create the random data */
+ for (i = 0; i < buflen; i += 1024) {
+ carry_buf[j][i] = i % 256;
+ hash_buf[j][i] = i % 256;
+ }
+ }
+
+ /* Thread sync */
+ pthread_mutex_lock(&count_lock);
+ count++;
+ if (count == num_threads) {
+ pthread_cond_broadcast(&count_cond);
+ } else {
+ pthread_cond_wait(&count_cond, &count_lock);
+ }
+ pthread_mutex_unlock(&count_lock);
+
+ printfv("Thread %i is ready\n", id);
+ /* hash func starts to run */
+ round = 0;
+ gettimeofday(&start_tv, 0);
+ gettimeofday(&stop_tv, 0);
+ while (secs > (stop_tv.tv_sec - start_tv.tv_sec)) {
+ for (j = 0; j < rounds_buf; j++) {
+ /* Pre mem-operation */
+ if (prememcpy)
+ memcpy(hash_buf[j], carry_buf[j], buflen);
+
+ /* Calculate hash digest */
+ OSSL_HASH_FUNC((char *)hash_buf[j], buflen, (unsigned char *)&digest);
+
+ /* Post mem-operation */
+ if (postmemcpy)
+ memcpy(carry_buf[j], hash_buf[j], buflen);
+ }
+ round++;
+
+ gettimeofday(&stop_tv, 0);
+ }
+ printfv("thread %2i, openssl_func rounds %ld\n", id, round);
+
+ out:
+ for (j = 0; j < rounds_buf; j++) {
+ free(carry_buf[j]);
+ free(hash_buf[j]);
+ }
+
+ pthread_exit((void *)round);
+}
+
+void *MB_THREAD_FUNC(void *arg)
+{
+ int32_t id = *((int *)arg);
+ uint32_t i = 0, j = 0;
+ char *hash_buf[rounds_buf] = { NULL }; /* hash buf is used to do hash compute */
+ char *carry_buf[rounds_buf] = { NULL }; /* carry buf is used to do memory movement */
+ hash_digests *digests[rounds_buf];
+ uint64_t round = -1;
+ struct timeval start_tv, stop_tv;
+ long long secs = run_secs;
+ int ret;
+
+ HASH_CTX_MGR *mgr = NULL;
+ HASH_CTX *ctxpool = NULL, *ctx = NULL;
+
+ printfv("Thread %i is started\n", id);
+ /* Memory allocate */
+ for (j = 0; j < rounds_buf; j++) {
+ carry_buf[j] = (char *)calloc((size_t)buflen, 1);
+ if (carry_buf[j] == NULL) {
+ printf("calloc failed test aborted\n");
+ goto out;
+ }
+
+ hash_buf[j] = (char *)calloc((size_t)buflen, 1);
+ if (hash_buf[j] == NULL) {
+ printf("calloc failed test aborted\n");
+ goto out;
+ }
+
+ digests[j] = (hash_digests *) calloc(sizeof(hash_digests), 1);
+
+ /* Create the random data */
+ for (i = 0; i < buflen; i += 1024) {
+ carry_buf[j][i] = i % 256;
+ hash_buf[j][i] = i % 256;
+ }
+ }
+
+ ctxpool = (HASH_CTX *) calloc(rounds_buf, sizeof(HASH_CTX));
+ for (i = 0; i < rounds_buf; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+ ret = posix_memalign((void *)&mgr, 16, sizeof(HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ goto out;
+ }
+ CTX_MGR_INIT(mgr);
+
+ printfv("Thread %i gets to wait\n", id);
+ /* Thread sync */
+ pthread_mutex_lock(&count_lock);
+ count++;
+ if (count == num_threads) {
+ pthread_cond_broadcast(&count_cond);
+ } else {
+ pthread_cond_wait(&count_cond, &count_lock);
+ }
+ pthread_mutex_unlock(&count_lock);
+
+ printfv("Thread %i is ready\n", id);
+ /* hash func starts to run */
+ round = 0;
+ gettimeofday(&start_tv, 0);
+ gettimeofday(&stop_tv, 0);
+ while (secs > (stop_tv.tv_sec - start_tv.tv_sec)) {
+ for (j = 0; j < rounds_buf; j += MB_BUFS) {
+ for (i = 0; i < MB_BUFS; i++) {
+ /* Pre mem-operation */
+ if (prememcpy)
+ memcpy(hash_buf[j + i], carry_buf[j + i], buflen);
+
+ CTX_MGR_SUBMIT(mgr, &ctxpool[j + i], hash_buf[j + i], buflen,
+ HASH_ENTIRE);
+ }
+
+ /* Calculate hash digest */
+ while (CTX_MGR_FLUSH(mgr)) ;
+ for (i = 0; i < MB_BUFS; i++) {
+ /* Post mem-operation */
+ if (postmemcpy)
+ memcpy(carry_buf[j + i], hash_buf[j + i], buflen);
+ }
+ }
+ round++;
+
+ gettimeofday(&stop_tv, 0);
+ }
+ printfv("thread %2i, multibuffer_func rounds %ld\n", id, round);
+
+ out:
+ free(ctxpool);
+ free(mgr);
+ for (j = 0; j < rounds_buf; j++) {
+ free(carry_buf[j]);
+ free(digests[j]);
+ free(hash_buf[j]);
+ }
+
+ pthread_exit((void *)round);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha1_thread.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha1_thread.c
new file mode 100644
index 000000000..5ec7eb04a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha1_thread.c
@@ -0,0 +1,20 @@
+
+#define HASH_THREAD
+/* sha1 related params and structures*/
+#define DIGEST_NWORDS SHA1_DIGEST_NWORDS
+#define MB_BUFS SHA1_MAX_LANES
+#define HASH_CTX_MGR SHA1_HASH_CTX_MGR
+#define HASH_CTX SHA1_HASH_CTX
+
+#define OSSL_THREAD_FUNC sha1_ossl_func
+#define OSSL_HASH_FUNC SHA1
+#define MB_THREAD_FUNC sha1_mb_func
+#define CTX_MGR_INIT sha1_ctx_mgr_init
+#define CTX_MGR_SUBMIT sha1_ctx_mgr_submit
+#define CTX_MGR_FLUSH sha1_ctx_mgr_flush
+
+#define rounds_buf SHA1_MAX_LANES
+
+#include "md5_thread.c"
+
+#undef HASH_THREAD
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha256_thread.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha256_thread.c
new file mode 100644
index 000000000..c155c19d4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha256_thread.c
@@ -0,0 +1,20 @@
+
+#define HASH_THREAD
+/* sha256 related params and structures*/
+#define DIGEST_NWORDS SHA256_DIGEST_NWORDS
+#define MB_BUFS SHA256_MAX_LANES
+#define HASH_CTX_MGR SHA256_HASH_CTX_MGR
+#define HASH_CTX SHA256_HASH_CTX
+
+#define OSSL_THREAD_FUNC sha256_ossl_func
+#define OSSL_HASH_FUNC SHA256
+#define MB_THREAD_FUNC sha256_mb_func
+#define CTX_MGR_INIT sha256_ctx_mgr_init
+#define CTX_MGR_SUBMIT sha256_ctx_mgr_submit
+#define CTX_MGR_FLUSH sha256_ctx_mgr_flush
+
+#define rounds_buf SHA256_MAX_LANES
+
+#include "md5_thread.c"
+
+#undef HASH_THREAD
diff --git a/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha512_thread.c b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha512_thread.c
new file mode 100644
index 000000000..5861835a8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/examples/saturation_test/sha512_thread.c
@@ -0,0 +1,20 @@
+
+#define HASH_THREAD
+/* sha512 related params and structures*/
+#define DIGEST_NWORDS (SHA512_DIGEST_NWORDS * 2)
+#define MB_BUFS SHA512_MAX_LANES
+#define HASH_CTX_MGR SHA512_HASH_CTX_MGR
+#define HASH_CTX SHA512_HASH_CTX
+
+#define OSSL_THREAD_FUNC sha512_ossl_func
+#define OSSL_HASH_FUNC SHA512
+#define MB_THREAD_FUNC sha512_mb_func
+#define CTX_MGR_INIT sha512_ctx_mgr_init
+#define CTX_MGR_SUBMIT sha512_ctx_mgr_submit
+#define CTX_MGR_FLUSH sha512_ctx_mgr_flush
+
+#define rounds_buf SHA512_MAX_LANES
+
+#include "md5_thread.c"
+
+#undef HASH_THREAD
diff --git a/src/crypto/isa-l/isa-l_crypto/include/aarch64_multibinary.h b/src/crypto/isa-l/isa-l_crypto/include/aarch64_multibinary.h
new file mode 100644
index 000000000..a8f81b232
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/aarch64_multibinary.h
@@ -0,0 +1,301 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef __AARCH64_MULTIBINARY_H__
+#define __AARCH64_MULTIBINARY_H__
+#ifndef __aarch64__
+#error "This file is for aarch64 only"
+#endif
+#include <asm/hwcap.h>
+#ifdef __ASSEMBLY__
+/**
+ * # mbin_interface : the wrapper layer for the isa-l API
+ *
+ * ## references:
+ * * https://sourceware.org/git/gitweb.cgi?p=glibc.git;a=blob;f=sysdeps/aarch64/dl-trampoline.S
+ * * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf
+ * * https://static.docs.arm.com/ihi0057/b/IHI0057B_aadwarf64.pdf?_ga=2.80574487.1870739014.1564969896-1634778941.1548729310
+ *
+ * ## Usage:
+ * 1. Define dispather function
+ * 2. name must be \name\()_dispatcher
+ * 3. Prototype should be *"void * \name\()_dispatcher"*
+ * 4. The dispather should return the right function pointer , revision and a string information .
+ **/
+.macro mbin_interface name:req
+ .extern \name\()_dispatcher
+ .section .data
+ .balign 8
+ .global \name\()_dispatcher_info
+ .type \name\()_dispatcher_info,%object
+
+ \name\()_dispatcher_info:
+ .quad \name\()_mbinit //func_entry
+
+ .size \name\()_dispatcher_info,. - \name\()_dispatcher_info
+
+ .balign 8
+ .text
+ \name\()_mbinit:
+ //save lp fp, sub sp
+ .cfi_startproc
+ stp x29, x30, [sp, -224]!
+
+ //add cfi directive to avoid GDB bt cmds error
+ //set cfi(Call Frame Information)
+ .cfi_def_cfa_offset 224
+ .cfi_offset 29, -224
+ .cfi_offset 30, -216
+
+ //save parameter/result/indirect result registers
+ stp x8, x9, [sp, 16]
+ .cfi_offset 8, -208
+ .cfi_offset 9, -200
+ stp x0, x1, [sp, 32]
+ .cfi_offset 0, -192
+ .cfi_offset 1, -184
+ stp x2, x3, [sp, 48]
+ .cfi_offset 2, -176
+ .cfi_offset 3, -168
+ stp x4, x5, [sp, 64]
+ .cfi_offset 4, -160
+ .cfi_offset 5, -152
+ stp x6, x7, [sp, 80]
+ .cfi_offset 6, -144
+ .cfi_offset 7, -136
+ stp q0, q1, [sp, 96]
+ .cfi_offset 64, -128
+ .cfi_offset 65, -112
+ stp q2, q3, [sp, 128]
+ .cfi_offset 66, -96
+ .cfi_offset 67, -80
+ stp q4, q5, [sp, 160]
+ .cfi_offset 68, -64
+ .cfi_offset 69, -48
+ stp q6, q7, [sp, 192]
+ .cfi_offset 70, -32
+ .cfi_offset 71, -16
+
+	/**
+	 * The dispatcher functions have the following prototype:
+	 * 	void * function_dispatcher(void)
+	 * Per the AAPCS, the returned function pointer is in x0.
+	 */
+
+
+ bl \name\()_dispatcher
+ //restore temp/indirect result registers
+ ldp x8, x9, [sp, 16]
+ .cfi_restore 8
+ .cfi_restore 9
+
+ // save function entry
+ str x0, [x9]
+
+ //restore parameter/result registers
+ ldp x0, x1, [sp, 32]
+ .cfi_restore 0
+ .cfi_restore 1
+ ldp x2, x3, [sp, 48]
+ .cfi_restore 2
+ .cfi_restore 3
+ ldp x4, x5, [sp, 64]
+ .cfi_restore 4
+ .cfi_restore 5
+ ldp x6, x7, [sp, 80]
+ .cfi_restore 6
+ .cfi_restore 7
+ ldp q0, q1, [sp, 96]
+ .cfi_restore 64
+ .cfi_restore 65
+ ldp q2, q3, [sp, 128]
+ .cfi_restore 66
+ .cfi_restore 67
+ ldp q4, q5, [sp, 160]
+ .cfi_restore 68
+ .cfi_restore 69
+ ldp q6, q7, [sp, 192]
+ .cfi_restore 70
+ .cfi_restore 71
+	//restore fp and lr, release the stack frame
+ ldp x29, x30, [sp], 224
+ //restore cfi setting
+ .cfi_restore 30
+ .cfi_restore 29
+ .cfi_def_cfa_offset 0
+ .cfi_endproc
+
+ .global \name
+ .type \name,%function
+ .align 2
+ \name\():
+ adrp x9, :got:\name\()_dispatcher_info
+ ldr x9, [x9, #:got_lo12:\name\()_dispatcher_info]
+ ldr x10,[x9]
+ br x10
+ .size \name,. - \name
+
+.endm
+
+/**
+ * mbin_interface_base is used for the interfaces which have only a
+ * generic (noarch) implementation.
+ */
+.macro mbin_interface_base name:req, base:req
+ .extern \base
+ .section .data
+ .balign 8
+ .global \name\()_dispatcher_info
+ .type \name\()_dispatcher_info,%object
+
+ \name\()_dispatcher_info:
+ .quad \base //func_entry
+ .size \name\()_dispatcher_info,. - \name\()_dispatcher_info
+
+ .balign 8
+ .text
+ .global \name
+ .type \name,%function
+ .align 2
+ \name\():
+ adrp x9, :got:\name\()_dispatcher_info
+ ldr x9, [x9, #:got_lo12:\name\()_dispatcher_info]
+ ldr x10,[x9]
+ br x10
+ .size \name,. - \name
+
+.endm
+
+#else /* __ASSEMBLY__ */
+#include <sys/auxv.h>
+
+
+
+#define DEFINE_INTERFACE_DISPATCHER(name) \
+ void * name##_dispatcher(void)
+
+#define PROVIDER_BASIC(name) \
+ PROVIDER_INFO(name##_base)
+
+#define DO_DIGNOSTIC(x) _Pragma GCC diagnostic ignored "-W"#x
+#define DO_PRAGMA(x) _Pragma (#x)
+#define DIGNOSTIC_IGNORE(x) DO_PRAGMA(GCC diagnostic ignored #x)
+#define DIGNOSTIC_PUSH() DO_PRAGMA(GCC diagnostic push)
+#define DIGNOSTIC_POP() DO_PRAGMA(GCC diagnostic pop)
+
+
+#define PROVIDER_INFO(_func_entry) \
+ ({ DIGNOSTIC_PUSH() \
+ DIGNOSTIC_IGNORE(-Wnested-externs) \
+ extern void _func_entry(void); \
+ DIGNOSTIC_POP() \
+ _func_entry; \
+ })
+
+/**
+ * Micro-architecture definitions
+ * Reference: https://developer.arm.com/docs/ddi0595/f/aarch64-system-registers/midr_el1
+ */
+
+#define CPU_IMPLEMENTER_RESERVE 0x00
+#define CPU_IMPLEMENTER_ARM 0x41
+
+
+#define CPU_PART_CORTEX_A57 0xD07
+#define CPU_PART_CORTEX_A72 0xD08
+#define CPU_PART_NEOVERSE_N1 0xD0C
+
+#define MICRO_ARCH_ID(imp,part) \
+ (((CPU_IMPLEMENTER_##imp&0xff)<<24)|((CPU_PART_##part&0xfff)<<4))
+
+#ifndef HWCAP_CPUID
+#define HWCAP_CPUID (1<<11)
+#endif
+
+/**
+ * @brief get_micro_arch_id
+ * Reads the micro-architecture identification register (MIDR_EL1) if possible.
+ * This provides micro-architecture information and makes micro-architecture
+ * specific optimization possible. The mrs instruction traps into the kernel,
+ * so this function should be called only in a dispatcher, which runs only once
+ * in the program lifecycle. The HWCAP requirements must also be met, which
+ * ensures that no illegal instruction errors occur.
+ *
+ * NOTICE:
+ * - HWCAP_CPUID must be available; otherwise zero is returned.
+ * - It MUST be called only inside a dispatcher.
+ * - It MUST meet the HWCAP requirements.
+ *
+ * Example:
+ * DEFINE_INTERFACE_DISPATCHER(crc32_iscsi)
+ * {
+ * unsigned long auxval = getauxval(AT_HWCAP);
+ *       // this HWCAP check is mandatory before reading MIDR_EL1.
+ * if ((HWCAP_CRC32 | HWCAP_PMULL) == (auxval & (HWCAP_CRC32 | HWCAP_PMULL))) {
+ * switch (get_micro_arch_id()) {
+ * case MICRO_ARCH_ID(ARM, CORTEX_A57):
+ * return PROVIDER_INFO(crc32_pmull_crc_for_a57);
+ * case MICRO_ARCH_ID(ARM, CORTEX_A72):
+ * return PROVIDER_INFO(crc32_pmull_crc_for_a72);
+ * case MICRO_ARCH_ID(ARM, NEOVERSE_N1):
+ * return PROVIDER_INFO(crc32_pmull_crc_for_n1);
+ *           default:
+ * return PROVIDER_INFO(crc32_pmull_crc_for_others);
+ * }
+ * }
+ * return PROVIDER_BASIC(crc32_iscsi);
+ * }
+ * KNOWN ISSUE:
+ *   On a heterogeneous system (big.LITTLE), it will work, but the performance
+ *   might not be as good as expected.
+ *
+ *   If this function is called on a big core, it will return the function
+ *   optimized for the big core.
+ *
+ *   If execution is then scheduled onto a little core, it will still work,
+ *   but the selected function won't be optimized for the little core, so the
+ *   performance won't be as expected.
+ *
+ * References:
+ * - [CPU Feature detection](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Documentation/arm64/cpu-feature-registers.rst?h=v5.5)
+ *
+ */
+static inline uint32_t get_micro_arch_id(void)
+{
+	uint32_t id = CPU_IMPLEMENTER_RESERVE;
+
+	if (getauxval(AT_HWCAP) & HWCAP_CPUID) {
+		asm("mrs %0, MIDR_EL1 " : "=r" (id));
+	}
+	return id & 0xff00fff0;
+}
+
+
+
+#endif /* __ASSEMBLY__ */
+#endif
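Usage sketch (illustrative only, not part of this patch): the C half of a multibinary interface built from the macros above. The interface name foo and the implementation symbols foo_asimd / foo_base are hypothetical placeholders; only the macros in this header, getauxval() and the HWCAP_ASIMD bit from <asm/hwcap.h> are assumed. The matching assembly side would invoke "mbin_interface foo", which caches whatever pointer this dispatcher returns on the first call.

#include <sys/auxv.h>
#include "aarch64_multibinary.h"

DEFINE_INTERFACE_DISPATCHER(foo)
{
	unsigned long auxval = getauxval(AT_HWCAP);

	/* Pick the SIMD implementation only when the CPU advertises ASIMD. */
	if (auxval & HWCAP_ASIMD)
		return PROVIDER_INFO(foo_asimd);

	/* Otherwise fall back to the generic C implementation foo_base. */
	return PROVIDER_BASIC(foo);
}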
diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h b/src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h
new file mode 100644
index 000000000..aaf87ada1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h
@@ -0,0 +1,165 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/**
+ * @file aes_cbc.h
+ * @brief AES CBC encryption/decryption function prototypes.
+ *
+ */
+#ifndef _AES_CBC_h
+#define _AES_CBC_h
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+
+#endif
+
+typedef enum cbc_key_size { CBC_128_BITS = 16, CBC_192_BITS = 24, CBC_256_BITS = 32} cbc_key_size;
+#define CBC_ROUND_KEY_LEN (16)
+#define CBC_128_KEY_ROUNDS (10+1) /*expanded key holds 10 key rounds plus original key*/
+#define CBC_192_KEY_ROUNDS (12+1) /*expanded key holds 12 key rounds plus original key*/
+#define CBC_256_KEY_ROUNDS (14+1) /*expanded key holds 14 key rounds plus original key*/
+#define CBC_MAX_KEYS_SIZE (CBC_ROUND_KEY_LEN * CBC_256_KEY_ROUNDS)
+
+#define CBC_IV_DATA_LEN (16)
+
+/** @brief holds intermediate key data used in encryption/decryption
+ *
+ */
+struct cbc_key_data { // must be 16 byte aligned
+ uint8_t enc_keys[CBC_MAX_KEYS_SIZE];
+ uint8_t dec_keys[CBC_MAX_KEYS_SIZE];
+};
+
+/** @brief CBC-AES key pre-computation done once for a key
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ * arg 1: in:   pointer to the key
+ * arg 2: key_size: length of the key in bytes (16, 24 or 32)
+ * arg 3: OUT:  pointer to the expanded key data
+ */
+int aes_cbc_precomp(
+ uint8_t *key,
+ int key_size,
+ struct cbc_key_data *keys_blk
+);
+
+/** @brief CBC-AES 128 bit key Decryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ * arg 1: in:   pointer to input (cipher text)
+ * arg 2: IV:   pointer to the IV, 16 bytes, aligned to a 16 byte boundary
+ * arg 3: keys: pointer to the expanded keys; must be on a 16 byte boundary, length = key size * key rounds
+ * arg 4: OUT:  pointer to output (plain text ... in-place allowed)
+ * arg 5: len_bytes:  length in bytes (must be a multiple of 16)
+ */
+void aes_cbc_dec_128(
+ void *in, //!< Input cipher text
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or dec_keys of cbc_key_data
+ void *out, //!< Output plain text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+
+/** @brief CBC-AES 192 bit key Decryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aes_cbc_dec_192(
+ void *in, //!< Input cipher text
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or dec_keys of cbc_key_data
+ void *out, //!< Output plain text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+
+/** @brief CBC-AES 256 bit key Decryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aes_cbc_dec_256(
+ void *in, //!< Input cipher text
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or dec_keys of cbc_key_data
+ void *out, //!< Output plain text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+
+/** @brief CBC-AES 128 bit key Encryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ * arg 1: in:   pointer to input (plain text)
+ * arg 2: IV:   pointer to the IV, 16 bytes, aligned to a 16 byte boundary
+ * arg 3: keys: pointer to the expanded keys; must be on a 16 byte boundary, length = key size * key rounds
+ * arg 4: OUT:  pointer to output (cipher text ... in-place allowed)
+ * arg 5: len_bytes:  length in bytes (must be a multiple of 16)
+ */
+int aes_cbc_enc_128(
+ void *in, //!< Input plain text
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or enc_keys of cbc_key_data
+ void *out, //!< Output cipher text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+/** @brief CBC-AES 192 bit key Encryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+int aes_cbc_enc_192(
+ void *in, //!< Input plain text
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or enc_keys of cbc_key_data
+ void *out, //!< Output cipher text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+
+/** @brief CBC-AES 256 bit key Encryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+int aes_cbc_enc_256(
+ void *in, //!< Input plain text
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or enc_keys of cbc_key_data
+ void *out, //!< Output cipher text
+ uint64_t len_bytes //!< Must be a multiple of 16 bytes
+ );
+
+#ifdef __cplusplus
+}
+#endif //__cplusplus
+#endif //ifndef _AES_CBC_h
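Usage sketch (illustrative only, not part of this patch): one possible AES-128-CBC encrypt/decrypt sequence using only the prototypes above. The key, IV and data contents are placeholders; the requirements carried over from the prototypes are the 16-byte alignment of the key schedule and IV and the 16-byte-multiple data length.

#include "aes_cbc.h"

static struct cbc_key_data key_data __attribute__((aligned(16)));

void cbc_example(void)
{
	uint8_t key[CBC_128_BITS] = {0};                 /* illustrative all-zero key  */
	uint8_t iv[CBC_IV_DATA_LEN] __attribute__((aligned(16))) = {0};
	uint8_t pt[64] = {0}, ct[64];                    /* length is a multiple of 16 */

	aes_cbc_precomp(key, CBC_128_BITS, &key_data);   /* expand enc + dec schedules once */
	aes_cbc_enc_128(pt, iv, key_data.enc_keys, ct, sizeof(pt));
	aes_cbc_dec_128(ct, iv, key_data.dec_keys, pt, sizeof(ct));
}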
diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h b/src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h
new file mode 100644
index 000000000..b407b7f6b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h
@@ -0,0 +1,613 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/**
+ * @file aes_gcm.h
+ * @brief AES GCM encryption/decryption function prototypes.
+ *
+ * At build time there is an option to use non-temporal loads and stores
+ * selected by defining the compile time option NT_LDST. The use of this option
+ * places the following restriction on the gcm encryption functions:
+ *
+ * - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ *
+ * - When using the streaming API, all partial input buffers must be a multiple
+ * of 64 bytes long except for the last input buffer.
+ *
+ * - In-place encryption/decryption is not recommended.
+ *
+ */
+
+/*
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+; Assumptions: Support for SSE4.1 or greater, AVX or AVX2
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+ */
+#ifndef _AES_GCM_h
+#define _AES_GCM_h
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+#define MAX_TAG_LEN (16)
+//
+// IV data is limited to 16 bytes. The last DWORD (4 bytes) must be 0x1
+//
+#define GCM_IV_LEN (16)
+#define GCM_IV_DATA_LEN (12)
+#define GCM_IV_END_MARK {0x00, 0x00, 0x00, 0x01};
+#define GCM_IV_END_START (12)
+
+#define LONGEST_TESTED_AAD_LENGTH (2* 1024)
+
+// Key lengths of 128 and 256 supported
+#define GCM_128_KEY_LEN (16)
+#define GCM_256_KEY_LEN (32)
+
+#define GCM_BLOCK_LEN 16
+#define GCM_ENC_KEY_LEN 16
+#define GCM_KEY_SETS (15) /*exp key + 14 exp round keys*/
+
+/**
+ * @brief holds intermediate key data needed to improve performance
+ *
+ * gcm_data holds internal key information used by gcm128 and gcm256.
+ */
+struct gcm_data {
+ uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS];
+ uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // store HashKey <<1 mod poly here
+ uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // store HashKey^2 <<1 mod poly here
+ uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // store HashKey^3 <<1 mod poly here
+ uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // store HashKey^4 <<1 mod poly here
+ uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // store HashKey^5 <<1 mod poly here
+ uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // store HashKey^6 <<1 mod poly here
+ uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // store HashKey^7 <<1 mod poly here
+ uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // store HashKey^8 <<1 mod poly here
+ uint8_t shifted_hkey_1_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_2_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_3_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_4_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_5_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_6_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_7_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_8_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+ // init, update and finalize context data
+ uint8_t aad_hash[GCM_BLOCK_LEN];
+ uint64_t aad_length;
+ uint64_t in_length;
+ uint8_t partial_block_enc_key[GCM_BLOCK_LEN];
+ uint8_t orig_IV[GCM_BLOCK_LEN];
+ uint8_t current_counter[GCM_BLOCK_LEN];
+ uint64_t partial_block_length;
+};
+
+/**
+ * @brief holds intermediate key data needed to improve performance
+ *
+ * gcm_key_data holds internal key information used by gcm128, gcm192 and gcm256.
+ */
+#ifdef __WIN32
+__declspec(align(16))
+#endif /* WIN32 */
+struct gcm_key_data {
+ uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS];
+ uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // store HashKey <<1 mod poly here
+ uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // store HashKey^2 <<1 mod poly here
+ uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // store HashKey^3 <<1 mod poly here
+ uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // store HashKey^4 <<1 mod poly here
+ uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // store HashKey^5 <<1 mod poly here
+ uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // store HashKey^6 <<1 mod poly here
+ uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // store HashKey^7 <<1 mod poly here
+ uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // store HashKey^8 <<1 mod poly here
+ uint8_t shifted_hkey_1_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits
+ uint8_t shifted_hkey_2_k[GCM_ENC_KEY_LEN]; // and Low 64b of HashKey^n <<1 mod poly
+ uint8_t shifted_hkey_3_k[GCM_ENC_KEY_LEN]; // here (for Karatsuba purposes)
+ uint8_t shifted_hkey_4_k[GCM_ENC_KEY_LEN];
+ uint8_t shifted_hkey_5_k[GCM_ENC_KEY_LEN];
+ uint8_t shifted_hkey_6_k[GCM_ENC_KEY_LEN];
+ uint8_t shifted_hkey_7_k[GCM_ENC_KEY_LEN];
+ uint8_t shifted_hkey_8_k[GCM_ENC_KEY_LEN];
+#ifdef GCM_BIG_DATA
+ uint8_t shifted_hkey_n_k[GCM_ENC_KEY_LEN * (128 - 16)]; // Big data version needs 128
+#else
+	uint8_t shifted_hkey_n_k[GCM_ENC_KEY_LEN * (48 - 16)];	// other (VAES) versions need 48
+#endif
+}
+#if defined (__unix__) || (__APPLE__) || (__MINGW32__)
+ __attribute__ ((aligned (16)));
+#else
+ ;
+#endif
+
+/**
+ * @brief holds GCM operation context
+ */
+struct gcm_context_data {
+ // init, update and finalize context data
+ uint8_t aad_hash[GCM_BLOCK_LEN];
+ uint64_t aad_length;
+ uint64_t in_length;
+ uint8_t partial_block_enc_key[GCM_BLOCK_LEN];
+ uint8_t orig_IV[GCM_BLOCK_LEN];
+ uint8_t current_counter[GCM_BLOCK_LEN];
+ uint64_t partial_block_length;
+};
+
+/* ------------------ New interface for separate expanded keys ------------ */
+
+/**
+ * @brief GCM-AES Encryption using 128 bit keys
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_128(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed
+ uint8_t const *in, //!< Plaintext input
+ uint64_t len, //!< Length of data in Bytes for encryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief GCM-AES Encryption using 256 bit keys
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_256(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed
+ uint8_t const *in, //!< Plaintext input
+ uint64_t len, //!< Length of data in Bytes for encryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+
+/**
+ * @brief GCM-AES Decryption using 128 bit keys
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_128(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed
+ uint8_t const *in, //!< Ciphertext input
+ uint64_t len, //!< Length of data in Bytes for decryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief GCM-AES Decryption using 256 bit keys
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_256(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed
+ uint8_t const *in, //!< Ciphertext input
+ uint64_t len, //!< Length of data in Bytes for decryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+
+/**
+ * @brief Start an AES-GCM Encryption message, 128 bit key
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_init_128(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *iv, //!< Pointer to 12 byte IV structure
+			//!< Internally, the library concatenates the 0x00000001 value to it
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len //!< Length of AAD
+ );
+
+/**
+ * @brief Start an AES-GCM Encryption message, 256 bit key
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_init_256(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *iv, //!< Pointer to 12 byte IV structure
+			//!< Internally, the library concatenates the 0x00000001 value to it
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len //!< Length of AAD
+ );
+
+/**
+ * @brief Encrypt a block of an AES-128-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_128_update(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed.
+ const uint8_t *in, //!< Plaintext input
+ uint64_t len //!< Length of data in Bytes for encryption
+ );
+
+/**
+ * @brief Encrypt a block of an AES-256-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_256_update(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed.
+ const uint8_t *in, //!< Plaintext input
+ uint64_t len //!< Length of data in Bytes for encryption
+ );
+
+/**
+ * @brief Decrypt a block of an AES-128-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_128_update(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed.
+ const uint8_t *in, //!< Ciphertext input
+ uint64_t len //!< Length of data in Bytes for decryption
+ );
+
+/**
+ * @brief Decrypt a block of an AES-256-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_256_update(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed.
+ const uint8_t *in, //!< Ciphertext input
+ uint64_t len //!< Length of data in Bytes for decryption
+ );
+
+/**
+ * @brief End encryption of an AES-128-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_128_finalize(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief End encryption of an AES-256-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_256_finalize(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief End decryption of an AES-128-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_128_finalize(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief End decryption of an AES-256-GCM Encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_256_finalize(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief Pre-processes GCM key data 128 bit
+ *
+ * Prefills the gcm key data with key values for each round and
+ * the initial sub hash key for tag encoding
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_pre_128(
+ const void *key, //!< Pointer to key data
+ struct gcm_key_data *key_data //!< GCM expanded key data
+ );
+
+/**
+ * @brief Pre-processes GCM key data 256 bit
+ *
+ * Prefills the gcm key data with key values for each round and
+ * the initial sub hash key for tag encoding
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_pre_256(
+ const void *key, //!< Pointer to key data
+ struct gcm_key_data *key_data //!< GCM expanded key data
+ );
+
+
+
+/* ---- NT versions ---- */
+/**
+ * @brief GCM-AES Encryption using 128 bit keys, Non-temporal data
+ *
+ * Non-temporal version of encrypt has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_128_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed
+ uint8_t const *in, //!< Plaintext input
+ uint64_t len, //!< Length of data in Bytes for encryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief GCM-AES Encryption using 256 bit keys, Non-temporal data
+ *
+ * Non-temporal version of encrypt has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_256_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed
+ uint8_t const *in, //!< Plaintext input
+ uint64_t len, //!< Length of data in Bytes for encryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+
+/**
+ * @brief GCM-AES Decryption using 128 bit keys, Non-temporal data
+ *
+ * Non-temporal version of decrypt has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_128_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed
+ uint8_t const *in, //!< Ciphertext input
+ uint64_t len, //!< Length of data in Bytes for decryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+/**
+ * @brief GCM-AES Decryption using 256 bit keys, Non-temporal data
+ *
+ * Non-temporal version of decrypt has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_256_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed
+ uint8_t const *in, //!< Ciphertext input
+ uint64_t len, //!< Length of data in Bytes for decryption
+ uint8_t *iv, //!< iv pointer to 12 byte IV structure.
+			 //!< Internally, the library concatenates the 0x00000001 value to it.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD)
+ uint64_t aad_len, //!< Length of AAD
+ uint8_t *auth_tag, //!< Authenticated Tag output
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes).
+ //!< Valid values are 16 (most likely), 12 or 8
+ );
+
+
+/**
+ * @brief Encrypt a block of an AES-128-GCM Encryption message, Non-temporal data
+ *
+ * Non-temporal version of encrypt update has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - All partial input buffers must be a multiple of 64 bytes long except for
+ * the last input buffer.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_128_update_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed.
+ const uint8_t *in, //!< Plaintext input
+ uint64_t len //!< Length of data in Bytes for encryption
+ );
+
+/**
+ * @brief Encrypt a block of an AES-256-GCM Encryption message, Non-temporal data
+ *
+ * Non-temporal version of encrypt update has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - All partial input buffers must be a multiple of 64 bytes long except for
+ * the last input buffer.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_enc_256_update_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed.
+ const uint8_t *in, //!< Plaintext input
+ uint64_t len //!< Length of data in Bytes for encryption
+ );
+
+/**
+ * @brief Decrypt a block of an AES-128-GCM Encryption message, Non-temporal data
+ *
+ * Non-temporal version of decrypt update has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - All partial input buffers must be a multiple of 64 bytes long except for
+ * the last input buffer.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_128_update_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed.
+ const uint8_t *in, //!< Ciphertext input
+ uint64_t len //!< Length of data in Bytes for decryption
+ );
+
+/**
+ * @brief Decrypt a block of an AES-256-GCM Encryption message, Non-temporal data
+ *
+ * Non-temporal version of decrypt update has additional restrictions:
+ *  - The plaintext and ciphertext buffers must be aligned on a 64 byte boundary.
+ * - All partial input buffers must be a multiple of 64 bytes long except for
+ * the last input buffer.
+ * - In-place encryption/decryption is not recommended. Performance can be slow.
+ *
+ * @requires SSE4.1 and AESNI
+ */
+void aes_gcm_dec_256_update_nt(
+ const struct gcm_key_data *key_data, //!< GCM expanded key data
+ struct gcm_context_data *context_data, //!< GCM operation context data
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed.
+ const uint8_t *in, //!< Ciphertext input
+ uint64_t len //!< Length of data in Bytes for decryption
+ );
+
+
+#ifdef __cplusplus
+}
+#endif //__cplusplus
+#endif //ifndef _AES_GCM_h
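Usage sketch (illustrative only, not part of this patch): a single-shot AES-128-GCM encryption using the prototypes above. The key, IV, AAD and message contents are placeholders; the streaming init/update/finalize variants follow the same pattern with the same gcm_context_data.

#include "aes_gcm.h"

void gcm_example(void)
{
	struct gcm_key_data key_data;         /* 16-byte aligned via the attribute above       */
	struct gcm_context_data ctx;
	uint8_t key[GCM_128_KEY_LEN] = {0};   /* illustrative all-zero key                     */
	uint8_t iv[GCM_IV_DATA_LEN] = {0};    /* 12 bytes; 0x00000001 is appended internally   */
	uint8_t aad[16] = {0};
	uint8_t pt[64] = {0}, ct[64], tag[MAX_TAG_LEN];

	aes_gcm_pre_128(key, &key_data);      /* expand round keys and hash subkeys once       */
	aes_gcm_enc_128(&key_data, &ctx, ct, pt, sizeof(pt),
			iv, aad, sizeof(aad), tag, MAX_TAG_LEN);
}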
diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h b/src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h
new file mode 100644
index 000000000..6ecded301
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h
@@ -0,0 +1,76 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _KEYEXP_128_H
+#define _KEYEXP_128_H
+
+/**
+ * @file aes_keyexp.h
+ * @brief AES key expansion functions
+ *
+ * This defines the interface to key expansion functions.
+ */
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @brief AES key expansion 128 bit
+ * @requires SSE4.1
+ */
+void aes_keyexp_128(
+ const uint8_t *key, //!< input key for AES-128, 16 bytes
+ uint8_t *exp_key_enc, //!< expanded encryption keys, 16*11 bytes
+ uint8_t *exp_key_dec //!< expanded decryption keys, 16*11 bytes
+ );
+
+/** @brief AES key expansion 192 bit
+ * @requires SSE4.1
+ */
+void aes_keyexp_192(
+	const uint8_t *key,		//!< input key for AES-192, 24 bytes (16*1.5)
+ uint8_t *exp_key_enc, //!< expanded encryption keys, 16*13 bytes
+ uint8_t *exp_key_dec //!< expanded decryption keys, 16*13 bytes
+ );
+
+/** @brief AES key expansion 256 bit
+ * @requires SSE4.1
+ */
+void aes_keyexp_256(
+ const uint8_t *key, //!< input key for AES-256, 16*2 bytes
+ uint8_t *exp_key_enc, //!< expanded encryption keys, 16*15 bytes
+ uint8_t *exp_key_dec //!< expanded decryption keys, 16*15 bytes
+ );
+
+#ifdef __cplusplus
+}
+#endif //__cplusplus
+#endif //ifndef _KEYEXP_128_H
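Usage sketch (illustrative only, not part of this patch): expanding a 16-byte AES-128 key into the 11-round encryption and decryption schedules that the CBC and XTS expanded-key functions consume; the key value is a placeholder.

#include "aes_keyexp.h"

void keyexp_example(void)
{
	const uint8_t key[16] = {0};   /* illustrative all-zero AES-128 key */
	uint8_t enc_keys[16 * 11];     /* 11 x 16-byte encryption round keys */
	uint8_t dec_keys[16 * 11];     /* 11 x 16-byte decryption round keys */

	aes_keyexp_128(key, enc_keys, dec_keys);
}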
diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_xts.h b/src/crypto/isa-l/isa-l_crypto/include/aes_xts.h
new file mode 100644
index 000000000..2021284f5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/aes_xts.h
@@ -0,0 +1,214 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#ifndef _AES_XTS_H
+#define _AES_XTS_H
+
+/**
+ * @file aes_xts.h
+ * @brief AES XTS encryption function prototypes.
+ *
+ * This defines the interface to optimized AES XTS functions
+ *
+ * <b>Pre-expanded keys</b>
+ *
+ * For encryption, the pre-expanded keys are stored in the order in which they will
+ * be used. As an example, if Key[0] is the 128-bit initial key used for an AES-128
+ * encryption, the rest of the keys are stored as follows:
+ *
+ * <ul>
+ * <li> Key[0] : Initial encryption key
+ * <li> Key[1] : Round 1 encryption key
+ * <li> Key[2] : Round 2 encryption key
+ * <li> ...
+ * <li> Key[10] : Round 10 encryption key
+ * </ul>
+ *
+ * For decryption, the order of keys is reversed. However, we apply the
+ * necessary aesimc instructions before storing the expanded keys. For the same key
+ * used above, the pre-expanded keys will be stored as follows:
+ *
+ * <ul>
+ * <li> Key[0] : Round 10 encryption key
+ * <li> Key[1] : aesimc(Round 9 encryption key)
+ * <li> Key[2] : aesimc(Round 8 encryption key)
+ * <li> ...
+ * <li> Key[9] : aesimc(Round 1 encryption key)
+ * <li> Key[10] : Initial encryption key
+ * </ul>
+ *
+ * <b>Note:</b> The expanded key decryption requires a decryption key only for the block
+ * decryption step. The tweak step in the expanded key decryption requires the same expanded
+ * encryption key that is used in the expanded key encryption.
+ *
+ * <b>Input and Output Buffers </b>
+ *
+ * The input and output buffers can be overlapping as long as the output buffer
+ * pointer is not less than the input buffer pointer. If the two pointers are the
+ * same, then encryption/decryption will occur in-place.
+ *
+ * <b>Data Length</b>
+ *
+ * <ul>
+ * <li> The functions support any data length greater than or equal to 16 bytes.
+ * <li> Data length is a 64-bit value, which makes the largest possible data length
+ * 2^64 - 1 bytes.
+ * <li> For data lengths from 0 to 15 bytes, the functions return without any error
+ * codes, without reading or writing any data.
+ * <li> The functions only support byte lengths, not bits.
+ * </ul>
+ *
+ * <b>Initial Tweak</b>
+ *
+ * The functions accept a 128-bit initial tweak value. The user is responsible for
+ * padding the initial tweak value to this length.
+ *
+ * <b>Data Alignment</b>
+ *
+ * The input and output buffers, keys, pre-expanded keys and initial tweak value
+ * are not required to be aligned to 16 bytes; any alignment works.
+ *
+ */
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @brief XTS-AES-128 Encryption
+ * @requires AES-NI
+ */
+
+void XTS_AES_128_enc(
+ uint8_t *k2, //!< key used for tweaking, 16 bytes
+ uint8_t *k1, //!< key used for encryption of tweaked plaintext, 16 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *pt, //!< plaintext sector input data
+ uint8_t *ct //!< ciphertext sector output data
+ );
+
+/** @brief XTS-AES-128 Encryption with pre-expanded keys
+ * @requires AES-NI
+ */
+
+void XTS_AES_128_enc_expanded_key(
+ uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes
+ uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*11 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *pt, //!< plaintext sector input data
+ uint8_t *ct //!< ciphertext sector output data
+ );
+
+/** @brief XTS-AES-128 Decryption
+ * @requires AES-NI
+ */
+
+void XTS_AES_128_dec(
+ uint8_t *k2, //!< key used for tweaking, 16 bytes
+ uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 16 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *ct, //!< ciphertext sector input data
+ uint8_t *pt //!< plaintext sector output data
+ );
+
+/** @brief XTS-AES-128 Decryption with pre-expanded keys
+ * @requires AES-NI
+ */
+
+void XTS_AES_128_dec_expanded_key(
+ uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes - encryption key is used
+ uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*11 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *ct, //!< ciphertext sector input data
+ uint8_t *pt //!< plaintext sector output data
+ );
+
+/** @brief XTS-AES-256 Encryption
+ * @requires AES-NI
+ */
+
+void XTS_AES_256_enc(
+ uint8_t *k2, //!< key used for tweaking, 16*2 bytes
+ uint8_t *k1, //!< key used for encryption of tweaked plaintext, 16*2 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *pt, //!< plaintext sector input data
+ uint8_t *ct //!< ciphertext sector output data
+ );
+
+/** @brief XTS-AES-256 Encryption with pre-expanded keys
+ * @requires AES-NI
+ */
+
+void XTS_AES_256_enc_expanded_key(
+ uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes
+ uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*15 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *pt, //!< plaintext sector input data
+ uint8_t *ct //!< ciphertext sector output data
+ );
+
+/** @brief XTS-AES-256 Decryption
+ * @requires AES-NI
+ */
+
+void XTS_AES_256_dec(
+ uint8_t *k2, //!< key used for tweaking, 16*2 bytes
+ uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 16*2 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *ct, //!< ciphertext sector input data
+ uint8_t *pt //!< plaintext sector output data
+ );
+
+/** @brief XTS-AES-256 Decryption with pre-expanded keys
+ * @requires AES-NI
+ */
+
+void XTS_AES_256_dec_expanded_key(
+ uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes - encryption key is used
+ uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*15 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *ct, //!< ciphertext sector input data
+ uint8_t *pt //!< plaintext sector output data
+ );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_AES_XTS_H
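Usage sketch (illustrative only, not part of this patch): encrypting and then decrypting one 512-byte sector with XTS-AES-128. The two 16-byte keys and the initial tweak value are placeholders.

#include "aes_xts.h"

void xts_example(void)
{
	uint8_t k2[16] = {0};            /* tweak key                          */
	uint8_t k1[16] = {0};            /* data encryption key                */
	uint8_t tweak[16] = {0};         /* initial tweak, e.g. sector number  */
	uint8_t pt[512] = {0}, ct[512];

	XTS_AES_128_enc(k2, k1, tweak, sizeof(pt), pt, ct);
	XTS_AES_128_dec(k2, k1, tweak, sizeof(ct), ct, pt);
}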
diff --git a/src/crypto/isa-l/isa-l_crypto/include/datastruct.asm b/src/crypto/isa-l/isa-l_crypto/include/datastruct.asm
new file mode 100644
index 000000000..3298ce374
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/datastruct.asm
@@ -0,0 +1,79 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macros for defining data structures
+
+; Usage example
+
+;START_FIELDS ; JOB_AES
+;;; name size align
+;FIELD _plaintext, 8, 8 ; pointer to plaintext
+;FIELD _ciphertext, 8, 8 ; pointer to ciphertext
+;FIELD _IV, 16, 8 ; IV
+;FIELD _keys, 8, 8 ; pointer to keys
+;FIELD _len, 4, 4 ; length in bytes
+;FIELD _status, 4, 4 ; status enumeration
+;FIELD _user_data, 8, 8 ; pointer to user data
+;UNION _union, size1, align1, \
+;	                size2,  align2, \
+;	                size3,  align3, \
+;	                ...
+;END_FIELDS
+;%assign _JOB_AES_size _FIELD_OFFSET
+;%assign _JOB_AES_align _STRUCT_ALIGN
+
+%ifndef _DATASTRUCT_ASM_
+%define _DATASTRUCT_ASM_
+
+;; START_FIELDS
+%macro START_FIELDS 0
+%assign _FIELD_OFFSET 0
+%assign _STRUCT_ALIGN 0
+%endm
+
+;; FIELD name size align
+%macro FIELD 3
+%define %%name %1
+%define %%size %2
+%define %%align %3
+
+%assign _FIELD_OFFSET (_FIELD_OFFSET + (%%align) - 1) & (~ ((%%align)-1))
+%%name equ _FIELD_OFFSET
+%assign _FIELD_OFFSET _FIELD_OFFSET + (%%size)
+%if (%%align > _STRUCT_ALIGN)
+%assign _STRUCT_ALIGN %%align
+%endif
+%endm
+
+;; END_FIELDS
+%macro END_FIELDS 0
+%assign _FIELD_OFFSET (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+%endm
+
+%endif ; end ifdef _DATASTRUCT_ASM_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/endian_helper.h b/src/crypto/isa-l/isa-l_crypto/include/endian_helper.h
new file mode 100644
index 000000000..87d90460a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/endian_helper.h
@@ -0,0 +1,83 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _ENDIAN_HELPER_H_
+#define _ENDIAN_HELPER_H_
+
+/**
+ * @file endian_helper.h
+ * @brief Byte order helper routines
+ *
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined (__ICC)
+# define byteswap32(x) _bswap(x)
+# define byteswap64(x) _bswap64(x)
+#elif defined (__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+# define byteswap32(x) __builtin_bswap32(x)
+# define byteswap64(x) __builtin_bswap64(x)
+#else
+# define byteswap32(x) ( ((x) << 24) \
+ | (((x) & 0xff00) << 8) \
+ | (((x) & 0xff0000) >> 8) \
+ | ((x)>>24))
+# define byteswap64(x) ( (((x) & (0xffull << 0)) << 56) \
+ | (((x) & (0xffull << 8)) << 40) \
+ | (((x) & (0xffull << 16)) << 24) \
+ | (((x) & (0xffull << 24)) << 8) \
+ | (((x) & (0xffull << 32)) >> 8) \
+ | (((x) & (0xffull << 40)) >> 24) \
+ | (((x) & (0xffull << 48)) >> 40) \
+ | (((x) & (0xffull << 56)) >> 56))
+#endif
+
+// This check works when using GCC (or LLVM). Assume little-endian
+// if any other compiler is being used.
+#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) \
+ && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+#define to_le32(x) byteswap32(x)
+#define to_le64(x) byteswap64(x)
+#define to_be32(x) (x)
+#define to_be64(x) (x)
+#else
+#define to_le32(x) (x)
+#define to_le64(x) (x)
+#define to_be32(x) byteswap32(x)
+#define to_be64(x) byteswap64(x)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _ENDIAN_HELPER_H_
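Usage sketch (illustrative only, not part of this patch): writing a 32-bit value in big-endian order regardless of host byte order, using only the to_be32() macro above; the helper name is a placeholder.

#include <stdint.h>
#include <string.h>
#include "endian_helper.h"

static inline void store_be32(uint8_t *out, uint32_t v)
{
	uint32_t be = to_be32(v);      /* byteswap on little-endian hosts, no-op on big-endian */

	memcpy(out, &be, sizeof(be));  /* memcpy keeps the store alignment-safe */
}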
diff --git a/src/crypto/isa-l/isa-l_crypto/include/intrinreg.h b/src/crypto/isa-l/isa-l_crypto/include/intrinreg.h
new file mode 100644
index 000000000..3c7ba2877
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/intrinreg.h
@@ -0,0 +1,65 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+/**
+ * @file intrinreg.h
+ * @brief Defines intrinsic types used by the new hashing API
+ *
+ */
+
+#ifndef _IA64_REGS_H_
+#define _IA64_REGS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+#include <stdint.h>
+#include <immintrin.h>
+
+// Define available register types uniformly.
+/// @cond
+typedef struct{ uint8_t dummy; } intrinreg1;
+typedef struct{ uint16_t dummy; } intrinreg2;
+typedef struct{ uint32_t dummy; } intrinreg4;
+typedef struct{ uint64_t dummy; } intrinreg8;
+typedef __m128 intrinreg16;
+/// @endcond
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _IA64_REGS_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/md5_mb.h b/src/crypto/isa-l/isa-l_crypto/include/md5_mb.h
new file mode 100644
index 000000000..fcbae5f62
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/md5_mb.h
@@ -0,0 +1,372 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MD5_MB_H_
+#define _MD5_MB_H_
+
+/**
+ * @file md5_mb.h
+ * @brief Multi-buffer CTX API MD5 function prototypes and structures
+ *
+ * Interface for multi-buffer MD5 functions
+ *
+ * <b> Multi-buffer MD5 Entire or First-Update..Update-Last </b>
+ *
+ * The interface to this multi-buffer hashing code is carried out through the
+ * context-level (CTX) init, submit and flush functions and the MD5_HASH_CTX_MGR and
+ * MD5_HASH_CTX objects. Numerous MD5_HASH_CTX objects may be instantiated by the
+ * application for use with a single MD5_HASH_CTX_MGR.
+ *
+ * The CTX interface functions carry out the initialization and padding of the jobs
+ * entered by the user and add them to the multi-buffer manager. The lower level "scheduler"
+ * layer then processes the jobs in an out-of-order manner. The scheduler layer functions
+ * are internal and are not intended to be invoked directly. Jobs can be submitted
+ * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial
+ * jobs which can be started using the HASH_FIRST flag, and later resumed or finished
+ * using the HASH_UPDATE and HASH_LAST flags respectively.
+ *
+ * <b>Note:</b> The submit function does not require data buffers to be block sized.
+ *
+ * The MD5 CTX interface functions are available for 4 architectures: SSE, AVX, AVX2 and
+ * AVX512. In addition, a multibinary interface is provided, which selects the appropriate
+ * architecture-specific function at runtime.
+ *
+ * <b>Usage:</b> The application creates a MD5_HASH_CTX_MGR object and initializes it
+ * with a call to the md5_ctx_mgr_init*() function, where henceforth "*" stands for the
+ * relevant suffix for each architecture; _sse, _avx, _avx2, _avx512 (or no suffix for the
+ * multibinary version). The MD5_HASH_CTX_MGR object will be used to schedule processor
+ * resources, with up to 8 MD5_HASH_CTX objects (or 16 in the AVX2 case, 32 in the AVX512 case)
+ * being processed at a time.
+ *
+ * Each MD5_HASH_CTX must be initialized before first use by the hash_ctx_init macro
+ * defined in multi_buffer.h. After initialization, the application may begin computing
+ * a hash by giving the MD5_HASH_CTX to a MD5_HASH_CTX_MGR using the submit functions
+ * md5_ctx_mgr_submit*() with the HASH_FIRST flag set. When the MD5_HASH_CTX is
+ * returned to the application (via this or a later call to md5_ctx_mgr_submit*() or
+ * md5_ctx_mgr_flush*()), the application can then re-submit it with another call to
+ * md5_ctx_mgr_submit*(), but without the HASH_FIRST flag set.
+ *
+ * Ideally, on the last buffer for that hash, md5_ctx_mgr_submit*() is called with
+ * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero
+ * length if necessary. When a MD5_HASH_CTX is returned after having been submitted with
+ * HASH_LAST, it will contain a valid hash. The MD5_HASH_CTX can be reused immediately
+ * by submitting with HASH_FIRST.
+ *
+ * For example, you would submit hashes with the following flags for the following numbers
+ * of buffers:
+ * <ul>
+ * <li> one buffer: HASH_FIRST | HASH_LAST (or, equivalently, HASH_ENTIRE)
+ * <li> two buffers: HASH_FIRST, HASH_LAST
+ * <li> three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * etc.
+ * </ul>
+ *
+ * The order in which MD5_HASH_CTX objects are returned is in general different from the order
+ * in which they are submitted.
+ *
+ * A few possible error conditions exist:
+ * <ul>
+ * <li> Submitting flags other than the allowed entire/first/update/last values
+ * <li> Submitting a context that is currently being managed by a MD5_HASH_CTX_MGR.
+ * <li> Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ * </ul>
+ *
+ * These error conditions are reported by returning the MD5_HASH_CTX immediately after
+ * a submit with its error member set to a non-zero error code (defined in
+ * multi_buffer.h). No changes are made to the MD5_HASH_CTX_MGR in the case of an
+ * error; no processing is done for other hashes.
+ *
+ */
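+
+/*
+ * Illustrative sketch of the flow described above: hashing one complete buffer
+ * through the multibinary CTX interface. "buffer" and "len" are hypothetical
+ * placeholders for the application's data.
+ *
+ * \code
+ * MD5_HASH_CTX_MGR *mgr = ...;   // allocated by the application, e.g. on the heap
+ * MD5_HASH_CTX ctx;
+ *
+ * md5_ctx_mgr_init(mgr);
+ * hash_ctx_init(&ctx);           // macro from multi_buffer.h
+ * md5_ctx_mgr_submit(mgr, &ctx, buffer, len, HASH_ENTIRE);
+ * while (md5_ctx_mgr_flush(mgr) != NULL)
+ *         ;                      // drain until all outstanding jobs complete
+ * // ctx.job.result_digest[] now holds the MD5_DIGEST_NWORDS digest words
+ * \endcode
+ */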
+
+#include <stdint.h>
+#include "multi_buffer.h"
+#include "types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Hash Constants and Typedefs
+#define MD5_DIGEST_NWORDS 4
+#define MD5_MAX_LANES 32
+#define MD5_MIN_LANES 8
+#define MD5_BLOCK_SIZE 64
+#define MD5_LOG2_BLOCK_SIZE 6
+#define MD5_PADLENGTHFIELD_SIZE 8
+#define MD5_INITIAL_DIGEST \
+ 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476
+
+typedef uint32_t md5_digest_array[MD5_DIGEST_NWORDS][MD5_MAX_LANES];
+typedef uint32_t MD5_WORD_T;
+
+/** @brief Scheduler layer - Holds info describing a single MD5 job for the multi-buffer manager */
+
+typedef struct {
+ uint8_t* buffer; //!< pointer to data buffer for this job
+ uint32_t len; //!< length of buffer for this job in blocks.
+ DECLARE_ALIGNED(uint32_t result_digest[MD5_DIGEST_NWORDS],64);
+ JOB_STS status; //!< output job status
+ void* user_data; //!< pointer for user's job-related data
+} MD5_JOB;
+
+/** @brief Scheduler layer - Holds arguments for submitted MD5 job */
+
+typedef struct {
+ md5_digest_array digest;
+ uint8_t* data_ptr[MD5_MAX_LANES];
+} MD5_MB_ARGS_X32;
+
+/** @brief Scheduler layer - Lane data */
+
+typedef struct {
+ MD5_JOB *job_in_lane;
+} MD5_LANE_DATA;
+
+/** @brief Scheduler layer - Holds state for multi-buffer MD5 jobs */
+
+typedef struct {
+ MD5_MB_ARGS_X32 args;
+ uint32_t lens[MD5_MAX_LANES];
+	uint64_t unused_lanes[4]; //!< each byte or nibble is the index (0...31 or 0...15) of an unused lane.
+ MD5_LANE_DATA ldata[MD5_MAX_LANES];
+ uint32_t num_lanes_inuse;
+} MD5_MB_JOB_MGR;
+
+/** @brief Context layer - Holds state for multi-buffer MD5 jobs */
+
+typedef struct {
+ MD5_MB_JOB_MGR mgr;
+} MD5_HASH_CTX_MGR;
+
+/** @brief Context layer - Holds info describing a single MD5 job for the multi-buffer CTX manager */
+
+typedef struct {
+ MD5_JOB job; // Must be at struct offset 0.
+ HASH_CTX_STS status; //!< Context status flag
+ HASH_CTX_ERROR error; //!< Context error flag
+ uint64_t total_length; //!< Running counter of length processed for this CTX's job
+ const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job
+ uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes.
+ uint8_t partial_block_buffer[MD5_BLOCK_SIZE * 2]; //!< CTX partial blocks
+ uint32_t partial_block_buffer_length;
+ void* user_data; //!< pointer for user to keep any job-related data
+} MD5_HASH_CTX;
+
+/*******************************************************************
+ * CTX level API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the context level MD5 multi-buffer manager structure.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void md5_ctx_mgr_init_sse (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new MD5 job to the context level multi-buffer manager.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_submit_sse (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted MD5 jobs and return when complete.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_flush_sse (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the MD5 multi-buffer manager structure.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void md5_ctx_mgr_init_avx (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new MD5 job to the multi-buffer manager.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_submit_avx (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted MD5 jobs and return when complete.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_flush_avx (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the MD5 multi-buffer manager structure.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void md5_ctx_mgr_init_avx2 (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new MD5 job to the multi-buffer manager.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_submit_avx2 (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted MD5 jobs and return when complete.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_flush_avx2 (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the MD5 multi-buffer manager structure.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void md5_ctx_mgr_init_avx512 (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new MD5 job to the multi-buffer manager.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_submit_avx512 (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted MD5 jobs and return when complete.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_flush_avx512 (MD5_HASH_CTX_MGR* mgr);
+
+/******************** multibinary function prototypes **********************/
+
+/**
+ * @brief Initialize the MD5 multi-buffer manager structure.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void md5_ctx_mgr_init (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new MD5 job to the multi-buffer manager.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_submit (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted MD5 jobs and return when complete.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_flush (MD5_HASH_CTX_MGR* mgr);
+
+
+/*******************************************************************
+ * Scheduler (internal) level out-of-order function prototypes
+ ******************************************************************/
+
+void md5_mb_mgr_init_sse (MD5_MB_JOB_MGR *state);
+MD5_JOB* md5_mb_mgr_submit_sse (MD5_MB_JOB_MGR *state, MD5_JOB* job);
+MD5_JOB* md5_mb_mgr_flush_sse (MD5_MB_JOB_MGR *state);
+
+#define md5_mb_mgr_init_avx md5_mb_mgr_init_sse
+MD5_JOB* md5_mb_mgr_submit_avx (MD5_MB_JOB_MGR *state, MD5_JOB* job);
+MD5_JOB* md5_mb_mgr_flush_avx (MD5_MB_JOB_MGR *state);
+
+void md5_mb_mgr_init_avx2 (MD5_MB_JOB_MGR *state);
+MD5_JOB* md5_mb_mgr_submit_avx2 (MD5_MB_JOB_MGR *state, MD5_JOB* job);
+MD5_JOB* md5_mb_mgr_flush_avx2 (MD5_MB_JOB_MGR *state);
+
+void md5_mb_mgr_init_avx512 (MD5_MB_JOB_MGR *state);
+MD5_JOB* md5_mb_mgr_submit_avx512 (MD5_MB_JOB_MGR *state, MD5_JOB* job);
+MD5_JOB* md5_mb_mgr_flush_avx512 (MD5_MB_JOB_MGR *state);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _MD5_MB_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/memcpy.asm b/src/crypto/isa-l/isa-l_crypto/include/memcpy.asm
new file mode 100644
index 000000000..7cb153540
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/memcpy.asm
@@ -0,0 +1,615 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifndef __MEMCPY_ASM__
+%define __MEMCPY_ASM__
+
+%include "reg_sizes.asm"
+
+
+; This file defines a series of macros to copy small to medium amounts
+; of data from memory to memory, where the size is variable but limited.
+;
+; The macros are all called as:
+; memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3
+; with the parameters defined as:
+; DST : register: pointer to dst (not modified)
+; SRC : register: pointer to src (not modified)
+; SIZE : register: length in bytes (not modified)
+; TMP0 : 64-bit temp GPR (clobbered)
+; TMP1 : 64-bit temp GPR (clobbered)
+; XTMP0 : temp XMM (clobbered)
+; XTMP1 : temp XMM (clobbered)
+; XTMP2 : temp XMM (clobbered)
+; XTMP3 : temp XMM (clobbered)
+;
+; The name indicates the options. The name is of the form:
+; memcpy_<VEC>_<SZ><ZERO><RET>
+; where:
+; <VEC> is either "sse" or "avx" or "avx2"
+; <SZ> is either "64" or "128" and defines largest value of SIZE
+; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
+; <RET> is blank or "_ret". If blank, the code falls through. If "_ret",
+; it does a "ret" at the end
+;
+; For the avx2 versions, the temp XMM registers need to be YMM registers
+; If the SZ is 64, then only two YMM temps are needed, i.e. it is called as:
+; memcpy_avx2_64 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1
+; memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3
+;
+; For example:
+; memcpy_sse_64 : SSE, 0 <= size < 64, falls through
+; memcpy_avx_64_1 : AVX1, 1 <= size < 64, falls through
+; memcpy_sse_128_ret : SSE, 0 <= size < 128, ends with ret
+; memcpy_avx_128_1_ret : AVX1, 1 <= size < 128, ends with ret
+;
+
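+; Illustrative invocation (a commented sketch; the register choices below are
+; arbitrary examples, not requirements):
+;
+;      ; rdi = dst, rsi = src, rdx = size with 0 <= size < 128
+;      memcpy_sse_128 rdi, rsi, rdx, rax, r10, xmm0, xmm1, xmm2, xmm3
+;      ; execution falls through to here once the copy is done
+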
+%macro memcpy_sse_64 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0
+%endm
+
+%macro memcpy_sse_64_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0
+%endm
+
+%macro memcpy_sse_128 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0
+%endm
+
+%macro memcpy_sse_128_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0
+%endm
+
+%macro memcpy_sse_64_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0
+%endm
+
+%macro memcpy_sse_64_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0
+%endm
+
+%macro memcpy_sse_128_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0
+%endm
+
+%macro memcpy_sse_128_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0
+%endm
+
+
+%macro memcpy_sse_16 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0
+%endm
+
+%macro memcpy_sse_16_1 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0
+%endm
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%macro memcpy_avx_64 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1
+%endm
+
+%macro memcpy_avx_64_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1
+%endm
+
+%macro memcpy_avx_128 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1
+%endm
+
+%macro memcpy_avx_128_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1
+%endm
+
+%macro memcpy_avx_64_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1
+%endm
+
+%macro memcpy_avx_64_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1
+%endm
+
+%macro memcpy_avx_128_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1
+%endm
+
+%macro memcpy_avx_128_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1
+%endm
+
+
+%macro memcpy_avx_16 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1
+%endm
+
+%macro memcpy_avx_16_1 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1
+%endm
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%macro memcpy_avx2_64 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2
+%endm
+
+%macro memcpy_avx2_64_1 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2
+%endm
+
+%macro memcpy_avx2_128 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 0, 2
+%endm
+
+%macro memcpy_avx2_128_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 0, 2
+%endm
+
+%macro memcpy_avx2_64_ret 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2
+%endm
+
+%macro memcpy_avx2_64_1_ret 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2
+%endm
+
+%macro memcpy_avx2_128_ret 9
+	__memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 1, 2
+%endm
+
+%macro memcpy_avx2_128_1_ret 9
+	__memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 1, 2
+%endm
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+%macro __memcpy_int 13
+%define %%DST %1 ; register: pointer to dst (not modified)
+%define %%SRC %2 ; register: pointer to src (not modified)
+%define %%SIZE %3 ; register: length in bytes (not modified)
+%define %%TMP0 %4 ; 64-bit temp GPR (clobbered)
+%define %%TMP1 %5 ; 64-bit temp GPR (clobbered)
+%define %%XTMP0 %6 ; temp XMM (clobbered)
+%define %%XTMP1 %7 ; temp XMM (clobbered)
+%define %%XTMP2 %8 ; temp XMM (clobbered)
+%define %%XTMP3 %9 ; temp XMM (clobbered)
+%define %%NOT0 %10 ; if not 0, then assume size cannot be zero
+%define %%MAXSIZE %11 ; 128, 64, etc
+%define %%USERET %12 ; if not 0, use "ret" at end
+%define %%USEAVX %13 ; 0 = SSE, 1 = AVX1, 2 = AVX2
+
+%if (%%USERET != 0)
+ %define %%DONE ret
+%else
+ %define %%DONE jmp %%end
+%endif
+
+%if (%%USEAVX != 0)
+ %define %%MOVDQU vmovdqu
+%else
+ %define %%MOVDQU movdqu
+%endif
+
+%if (%%MAXSIZE >= 128)
+ test %%SIZE, 64
+ jz %%lt64
+ %if (%%USEAVX >= 2)
+ %%MOVDQU %%XTMP0, [%%SRC + 0*32]
+ %%MOVDQU %%XTMP1, [%%SRC + 1*32]
+ %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*32]
+ %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*32]
+
+ %%MOVDQU [%%DST + 0*32], %%XTMP0
+ %%MOVDQU [%%DST + 1*32], %%XTMP1
+ %%MOVDQU [%%DST + %%SIZE - 2*32], %%XTMP2
+ %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP3
+ %else
+ %%MOVDQU %%XTMP0, [%%SRC + 0*16]
+ %%MOVDQU %%XTMP1, [%%SRC + 1*16]
+ %%MOVDQU %%XTMP2, [%%SRC + 2*16]
+ %%MOVDQU %%XTMP3, [%%SRC + 3*16]
+ %%MOVDQU [%%DST + 0*16], %%XTMP0
+ %%MOVDQU [%%DST + 1*16], %%XTMP1
+ %%MOVDQU [%%DST + 2*16], %%XTMP2
+ %%MOVDQU [%%DST + 3*16], %%XTMP3
+
+ %%MOVDQU %%XTMP0, [%%SRC + %%SIZE - 4*16]
+ %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 3*16]
+ %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
+ %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + %%SIZE - 4*16], %%XTMP0
+ %%MOVDQU [%%DST + %%SIZE - 3*16], %%XTMP1
+ %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
+ %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
+ %endif
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 64)
+%%lt64:
+ test %%SIZE, 32
+ jz %%lt32
+ %if (%%USEAVX >= 2)
+ %%MOVDQU %%XTMP0, [%%SRC + 0*32]
+ %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*32]
+ %%MOVDQU [%%DST + 0*32], %%XTMP0
+ %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP1
+ %else
+ %%MOVDQU %%XTMP0, [%%SRC + 0*16]
+ %%MOVDQU %%XTMP1, [%%SRC + 1*16]
+ %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
+ %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + 0*16], %%XTMP0
+ %%MOVDQU [%%DST + 1*16], %%XTMP1
+ %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
+ %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
+ %endif
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 32)
+%%lt32:
+ test %%SIZE, 16
+ jz %%lt16
+ %if (%%USEAVX >= 2)
+ %%MOVDQU XWORD(%%XTMP0), [%%SRC + 0*16]
+ %%MOVDQU XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + 0*16], XWORD(%%XTMP0)
+ %%MOVDQU [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1)
+ %else
+ %%MOVDQU %%XTMP0, [%%SRC + 0*16]
+ %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + 0*16], %%XTMP0
+ %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP1
+ %endif
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 16)
+%%lt16:
+ test %%SIZE, 8
+ jz %%lt8
+ mov %%TMP0, [%%SRC]
+ mov %%TMP1, [%%SRC + %%SIZE - 8]
+ mov [%%DST], %%TMP0
+ mov [%%DST + %%SIZE - 8], %%TMP1
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 8)
+%%lt8:
+ test %%SIZE, 4
+ jz %%lt4
+ mov DWORD(%%TMP0), [%%SRC]
+ mov DWORD(%%TMP1), [%%SRC + %%SIZE - 4]
+ mov [%%DST], DWORD(%%TMP0)
+ mov [%%DST + %%SIZE - 4], DWORD(%%TMP1)
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 4)
+%%lt4:
+ test %%SIZE, 2
+ jz %%lt2
+ movzx DWORD(%%TMP0), word [%%SRC]
+ movzx DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1]
+ mov [%%DST], WORD(%%TMP0)
+ mov [%%DST + %%SIZE - 1], BYTE(%%TMP1)
+ %%DONE
+%endif
+
+%%lt2:
+%if (%%NOT0 == 0)
+ test %%SIZE, 1
+ jz %%end
+%endif
+ movzx DWORD(%%TMP0), byte [%%SRC]
+ mov [%%DST], BYTE(%%TMP0)
+%%end:
+%if (%%USERET != 0)
+ ret
+%endif
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; Utility macro to assist with SIMD shifting
+%macro _PSRLDQ 3
+%define %%VEC %1
+%define %%REG %2
+%define %%IMM %3
+
+%ifidn %%VEC, SSE
+ psrldq %%REG, %%IMM
+%else
+ vpsrldq %%REG, %%REG, %%IMM
+%endif
+%endm
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; This section defines a series of macros to store small to medium amounts
+; of data from SIMD registers to memory, where the size is variable but limited.
+;
+; The macros are all called as:
+; simd_store DST, SRC, SIZE, TMP, IDX
+; with the parameters defined as:
+; DST : register: pointer to dst (not modified)
+; SRC : register: src data (clobbered)
+; SIZE : register: length in bytes (not modified)
+; TMP : 64-bit temp GPR (clobbered)
+; IDX : 64-bit GPR to store dst index/offset (clobbered)
+;
+; The name indicates the options. The name is of the form:
+; simd_store_<VEC>
+; where <VEC> is the SIMD instruction type e.g. "sse" or "avx"
+
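+; Illustrative invocation (a commented sketch; register choices are arbitrary):
+;
+;      ; rdi = dst, xmm0 = source data (clobbered), rdx = size (0..16 bytes)
+;      simd_store_sse rdi, xmm0, rdx, rax, r10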
+
+%macro simd_store_sse 5
+ __simd_store %1,%2,%3,%4,%5,SSE
+%endm
+
+%macro simd_store_avx 5
+ __simd_store %1,%2,%3,%4,%5,AVX
+%endm
+
+%macro simd_store_sse_15 5
+ __simd_store %1,%2,%3,%4,%5,SSE,15
+%endm
+
+%macro simd_store_avx_15 5
+ __simd_store %1,%2,%3,%4,%5,AVX,15
+%endm
+
+%macro __simd_store 6-7
+%define %%DST %1 ; register: pointer to dst (not modified)
+%define %%SRC %2 ; register: src data (clobbered)
+%define %%SIZE %3 ; register: length in bytes (not modified)
+%define %%TMP %4 ; 64-bit temp GPR (clobbered)
+%define %%IDX %5 ; 64-bit temp GPR to store dst idx (clobbered)
+%define %%SIMDTYPE %6 ; "SSE" or "AVX"
+%define %%MAX_LEN %7 ; [optional] maximum length to be stored, default 16
+
+%define %%PSRLDQ _PSRLDQ %%SIMDTYPE,
+
+%ifidn %%SIMDTYPE, SSE
+ %define %%MOVDQU movdqu
+ %define %%MOVQ movq
+%else
+ %define %%MOVDQU vmovdqu
+ %define %%MOVQ vmovq
+%endif
+
+;; determine max byte size for store operation
+%if %0 > 6
+%assign max_length_to_store %%MAX_LEN
+%else
+%assign max_length_to_store 16
+%endif
+
+%if max_length_to_store > 16
+%error "__simd_store macro invoked with MAX_LEN bigger than 16!"
+%endif
+
+ xor %%IDX, %%IDX ; zero idx
+
+%if max_length_to_store == 16
+ test %%SIZE, 16
+ jz %%lt16
+ %%MOVDQU [%%DST], %%SRC
+ jmp %%end
+%%lt16:
+%endif
+
+%if max_length_to_store >= 8
+ test %%SIZE, 8
+ jz %%lt8
+ %%MOVQ [%%DST + %%IDX], %%SRC
+ %%PSRLDQ %%SRC, 8
+ add %%IDX, 8
+%%lt8:
+%endif
+
+ %%MOVQ %%TMP, %%SRC ; use GPR from now on
+
+%if max_length_to_store >= 4
+ test %%SIZE, 4
+ jz %%lt4
+ mov [%%DST + %%IDX], DWORD(%%TMP)
+ shr %%TMP, 32
+ add %%IDX, 4
+%%lt4:
+%endif
+
+ test %%SIZE, 2
+ jz %%lt2
+ mov [%%DST + %%IDX], WORD(%%TMP)
+ shr %%TMP, 16
+ add %%IDX, 2
+%%lt2:
+ test %%SIZE, 1
+ jz %%end
+ mov [%%DST + %%IDX], BYTE(%%TMP)
+%%end:
+%endm
+
+; This section defines a series of macros to load small to medium amounts
+; (from 0 to 16 bytes) of data from memory to SIMD registers,
+; where the size is variable but limited.
+;
+; The macros are all called as:
+; simd_load DST, SRC, SIZE
+; with the parameters defined as:
+; DST : register: destination XMM register
+; SRC : register: pointer to src data (not modified)
+; SIZE : register: length in bytes (not modified)
+;
+; The name indicates the options. The name is of the form:
+; simd_load_<VEC>_<SZ><ZERO>
+; where:
+; <VEC> is either "sse" or "avx"
+; <SZ> is either "15" or "16" and defines largest value of SIZE
+; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
+;
+; For example:
+; simd_load_sse_16 : SSE, 0 <= size <= 16
+; simd_load_avx_15_1 : AVX, 1 <= size <= 15
+
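+; Illustrative invocation (a commented sketch; register choices are arbitrary):
+;
+;      ; rsi = src, rdx = size with 0 <= size <= 16; the result lands in xmm0
+;      simd_load_sse_16 xmm0, rsi, rdx
+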
+%macro simd_load_sse_15_1 3
+ __simd_load %1,%2,%3,0,0,SSE
+%endm
+%macro simd_load_sse_15 3
+ __simd_load %1,%2,%3,1,0,SSE
+%endm
+%macro simd_load_sse_16_1 3
+ __simd_load %1,%2,%3,0,1,SSE
+%endm
+%macro simd_load_sse_16 3
+ __simd_load %1,%2,%3,1,1,SSE
+%endm
+
+%macro simd_load_avx_15_1 3
+ __simd_load %1,%2,%3,0,0,AVX
+%endm
+%macro simd_load_avx_15 3
+ __simd_load %1,%2,%3,1,0,AVX
+%endm
+%macro simd_load_avx_16_1 3
+ __simd_load %1,%2,%3,0,1,AVX
+%endm
+%macro simd_load_avx_16 3
+ __simd_load %1,%2,%3,1,1,AVX
+%endm
+
+%macro __simd_load 6
+%define %%DST %1 ; [out] destination XMM register
+%define %%SRC %2 ; [in] pointer to src data
+%define %%SIZE %3 ; [in] length in bytes (0-16 bytes)
+%define %%ACCEPT_0 %4 ; 0 = min length = 1, 1 = min length = 0
+%define %%ACCEPT_16 %5 ; 0 = max length = 15 , 1 = max length = 16
+%define %%SIMDTYPE %6 ; "SSE" or "AVX"
+
+%ifidn %%SIMDTYPE, SSE
+ %define %%MOVDQU movdqu
+ %define %%PINSRB pinsrb
+ %define %%PINSRQ pinsrq
+ %define %%PXOR pxor
+%else
+ %define %%MOVDQU vmovdqu
+ %define %%PINSRB vpinsrb
+ %define %%PINSRQ vpinsrq
+ %define %%PXOR vpxor
+%endif
+
+%if (%%ACCEPT_16 != 0)
+ test %%SIZE, 16
+ jz %%_skip_16
+ %%MOVDQU %%DST, [%%SRC]
+ jmp %%end_load
+
+%%_skip_16:
+%endif
+ %%PXOR %%DST, %%DST ; clear XMM register
+%if (%%ACCEPT_0 != 0)
+ or %%SIZE, %%SIZE
+ je %%end_load
+%endif
+ cmp %%SIZE, 1
+ je %%_size_1
+ cmp %%SIZE, 2
+ je %%_size_2
+ cmp %%SIZE, 3
+ je %%_size_3
+ cmp %%SIZE, 4
+ je %%_size_4
+ cmp %%SIZE, 5
+ je %%_size_5
+ cmp %%SIZE, 6
+ je %%_size_6
+ cmp %%SIZE, 7
+ je %%_size_7
+ cmp %%SIZE, 8
+ je %%_size_8
+ cmp %%SIZE, 9
+ je %%_size_9
+ cmp %%SIZE, 10
+ je %%_size_10
+ cmp %%SIZE, 11
+ je %%_size_11
+ cmp %%SIZE, 12
+ je %%_size_12
+ cmp %%SIZE, 13
+ je %%_size_13
+ cmp %%SIZE, 14
+ je %%_size_14
+
+%%_size_15:
+ %%PINSRB %%DST, [%%SRC + 14], 14
+%%_size_14:
+ %%PINSRB %%DST, [%%SRC + 13], 13
+%%_size_13:
+ %%PINSRB %%DST, [%%SRC + 12], 12
+%%_size_12:
+ %%PINSRB %%DST, [%%SRC + 11], 11
+%%_size_11:
+ %%PINSRB %%DST, [%%SRC + 10], 10
+%%_size_10:
+ %%PINSRB %%DST, [%%SRC + 9], 9
+%%_size_9:
+ %%PINSRB %%DST, [%%SRC + 8], 8
+%%_size_8:
+ %%PINSRQ %%DST, [%%SRC], 0
+ jmp %%end_load
+%%_size_7:
+ %%PINSRB %%DST, [%%SRC + 6], 6
+%%_size_6:
+ %%PINSRB %%DST, [%%SRC + 5], 5
+%%_size_5:
+ %%PINSRB %%DST, [%%SRC + 4], 4
+%%_size_4:
+ %%PINSRB %%DST, [%%SRC + 3], 3
+%%_size_3:
+ %%PINSRB %%DST, [%%SRC + 2], 2
+%%_size_2:
+ %%PINSRB %%DST, [%%SRC + 1], 1
+%%_size_1:
+ %%PINSRB %%DST, [%%SRC + 0], 0
+%%end_load:
+%endm
+
+%endif ; ifndef __MEMCPY_ASM__
diff --git a/src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h b/src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h
new file mode 100644
index 000000000..e0cc314d1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h
@@ -0,0 +1,375 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+/**
+ * @file memcpy_inline.h
+ * @brief Defines intrinsic memcpy functions used by the new hashing API
+ *
+ */
+
+#ifndef _MEMCPY_H_
+#define _MEMCPY_H_
+
+#if defined(__i386__) || defined(__x86_64__) || defined( _M_X64) \
+ || defined(_M_IX86)
+#include "intrinreg.h"
+#endif
+#include <string.h>
+#include <assert.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__i386__) || defined(__x86_64__) || defined( _M_X64) \
+ || defined(_M_IX86)
+
+#define memcpy_varlen memcpy_sse_varlen
+#define memcpy_fixedlen memcpy_sse_fixedlen
+
+#define memclr_varlen memclr_sse_varlen
+#define memclr_fixedlen memclr_sse_fixedlen
+
+static inline void memcpy_lte32_sse_fixedlen(void* dst, const void* src, size_t nbytes);
+static inline void memcpy_gte16_sse_fixedlen(void* dst, const void* src, size_t nbytes);
+static inline void memcpy_sse_fixedlen (void* dst, const void* src, size_t nbytes);
+
+static inline void memcpy_lte32_sse_varlen (void* dst, const void* src, size_t nbytes);
+static inline void memcpy_gte16_sse_varlen (void* dst, const void* src, size_t nbytes);
+static inline void memcpy_sse_varlen (void* dst, const void* src, size_t nbytes);
+
+
+static inline void memclr_lte32_sse_fixedlen(void* dst, size_t nbytes);
+static inline void memclr_gte16_sse_fixedlen(void* dst, size_t nbytes);
+static inline void memclr_sse_fixedlen (void* dst, size_t nbytes);
+
+static inline void memclr_lte32_sse_varlen (void* dst, size_t nbytes);
+static inline void memclr_gte16_sse_varlen (void* dst, size_t nbytes);
+static inline void memclr_sse_varlen (void* dst, size_t nbytes);
+
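+/*
+ * Illustrative usage sketch ("dst", "src" and "n" are hypothetical): these
+ * helpers stand in for the libc calls when copying or clearing small regions.
+ *
+ * \code
+ * memcpy_varlen(dst, src, n);      // length known only at run time
+ * memcpy_fixedlen(dst, src, 64);   // length is a compile-time constant
+ * memclr_varlen(dst, n);           // equivalent to memset(dst, 0, n)
+ * \endcode
+ */
+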
+#define MEMCPY_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, src, nbytes) \
+ do { \
+ intrinreg##N head; \
+ intrinreg##N tail; \
+ assert(N <= nbytes && nbytes <= 2*N); \
+ if(N == 1 || (fixedwidth && nbytes==N) ) { \
+ head = load_intrinreg##N(src); \
+ store_intrinreg##N(dst, head); \
+ } \
+ else { \
+ head = load_intrinreg##N(src); \
+ tail = load_intrinreg##N((const void*)((const char*)src + (nbytes - N))); \
+ store_intrinreg##N(dst, head); \
+ store_intrinreg##N((void*)((char*)dst + (nbytes - N)), tail); \
+ } \
+ } while(0)
+
+#define MEMCLR_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, nbytes) \
+ do { \
+ const intrinreg##N zero = {0}; \
+ assert(N <= nbytes && nbytes <= 2*N); \
+ if(N == 1 || (fixedwidth && nbytes==N) ) { \
+ store_intrinreg##N(dst, zero); \
+ } \
+ else { \
+ store_intrinreg##N(dst, zero); \
+ store_intrinreg##N((void*)((char*)dst + (nbytes - N)), zero); \
+ } \
+ } while(0)
+
+// Define load/store functions uniformly.
+
+#define load_intrinreg16(src) _mm_loadu_ps((const float*) src)
+#define store_intrinreg16(dst,val) _mm_storeu_ps((float*) dst, val)
+
+static inline intrinreg8 load_intrinreg8(const void *src)
+{
+ return *(intrinreg8 *) src;
+}
+
+static inline void store_intrinreg8(void *dst, intrinreg8 val)
+{
+ *(intrinreg8 *) dst = val;
+}
+
+static inline intrinreg4 load_intrinreg4(const void *src)
+{
+ return *(intrinreg4 *) src;
+}
+
+static inline void store_intrinreg4(void *dst, intrinreg4 val)
+{
+ *(intrinreg4 *) dst = val;
+}
+
+static inline intrinreg2 load_intrinreg2(const void *src)
+{
+ return *(intrinreg2 *) src;
+}
+
+static inline void store_intrinreg2(void *dst, intrinreg2 val)
+{
+ *(intrinreg2 *) dst = val;
+}
+
+static inline intrinreg1 load_intrinreg1(const void *src)
+{
+ return *(intrinreg1 *) src;
+}
+
+static inline void store_intrinreg1(void *dst, intrinreg1 val)
+{
+ *(intrinreg1 *) dst = val;
+}
+
+static inline void memcpy_gte16_sse_fixedlen(void *dst, const void *src, size_t nbytes)
+{
+ size_t i;
+ size_t j;
+ intrinreg16 pool[4];
+ size_t remaining_moves;
+ size_t tail_offset;
+ int do_tail;
+ assert(nbytes >= 16);
+
+ for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4) {
+ for (j = 0; j < 4; j++)
+ pool[j] =
+ load_intrinreg16((const void *)((const char *)src + i + 16 * j));
+ for (j = 0; j < 4; j++)
+ store_intrinreg16((void *)((char *)dst + i + 16 * j), pool[j]);
+ }
+
+ remaining_moves = (nbytes - i) / 16;
+ tail_offset = nbytes - 16;
+ do_tail = (tail_offset & (16 - 1));
+
+ for (j = 0; j < remaining_moves; j++)
+ pool[j] = load_intrinreg16((const void *)((const char *)src + i + 16 * j));
+
+ if (do_tail)
+ pool[j] = load_intrinreg16((const void *)((const char *)src + tail_offset));
+
+ for (j = 0; j < remaining_moves; j++)
+ store_intrinreg16((void *)((char *)dst + i + 16 * j), pool[j]);
+
+ if (do_tail)
+ store_intrinreg16((void *)((char *)dst + tail_offset), pool[j]);
+}
+
+static inline void memclr_gte16_sse_fixedlen(void *dst, size_t nbytes)
+{
+ size_t i;
+ size_t j;
+ const intrinreg16 zero = { 0 };
+ size_t remaining_moves;
+ size_t tail_offset;
+ int do_tail;
+ assert(nbytes >= 16);
+
+ for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4)
+ for (j = 0; j < 4; j++)
+ store_intrinreg16((void *)((char *)dst + i + 16 * j), zero);
+
+ remaining_moves = (nbytes - i) / 16;
+ tail_offset = nbytes - 16;
+ do_tail = (tail_offset & (16 - 1));
+
+ for (j = 0; j < remaining_moves; j++)
+ store_intrinreg16((void *)((char *)dst + i + 16 * j), zero);
+
+ if (do_tail)
+ store_intrinreg16((void *)((char *)dst + tail_offset), zero);
+}
+
+static inline void memcpy_lte32_sse_fixedlen(void *dst, const void *src, size_t nbytes)
+{
+ assert(nbytes <= 32);
+ if (nbytes >= 16)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 1, dst, src, nbytes);
+ else if (nbytes >= 8)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 1, dst, src, nbytes);
+ else if (nbytes >= 4)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 1, dst, src, nbytes);
+ else if (nbytes >= 2)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 1, dst, src, nbytes);
+ else if (nbytes >= 1)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 1, dst, src, nbytes);
+}
+
+static inline void memclr_lte32_sse_fixedlen(void *dst, size_t nbytes)
+{
+ assert(nbytes <= 32);
+ if (nbytes >= 16)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 1, dst, nbytes);
+ else if (nbytes >= 8)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 1, dst, nbytes);
+ else if (nbytes >= 4)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 1, dst, nbytes);
+ else if (nbytes >= 2)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 1, dst, nbytes);
+ else if (nbytes >= 1)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 1, dst, nbytes);
+}
+
+static inline void memcpy_lte32_sse_varlen(void *dst, const void *src, size_t nbytes)
+{
+ assert(nbytes <= 32);
+ if (nbytes >= 16)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 0, dst, src, nbytes);
+ else if (nbytes >= 8)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 0, dst, src, nbytes);
+ else if (nbytes >= 4)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 0, dst, src, nbytes);
+ else if (nbytes >= 2)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 0, dst, src, nbytes);
+ else if (nbytes >= 1)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 0, dst, src, nbytes);
+}
+
+static inline void memclr_lte32_sse_varlen(void *dst, size_t nbytes)
+{
+ assert(nbytes <= 32);
+ if (nbytes >= 16)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 0, dst, nbytes);
+ else if (nbytes >= 8)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 0, dst, nbytes);
+ else if (nbytes >= 4)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 0, dst, nbytes);
+ else if (nbytes >= 2)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 0, dst, nbytes);
+ else if (nbytes >= 1)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 0, dst, nbytes);
+}
+
+static inline void memcpy_gte16_sse_varlen(void *dst, const void *src, size_t nbytes)
+{
+ size_t i = 0;
+ intrinreg16 tail;
+
+ assert(nbytes >= 16);
+
+ while (i + 128 <= nbytes) {
+ memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
+ (const void *)((const char *)src + i), 128);
+ i += 128;
+ }
+ if (i + 64 <= nbytes) {
+ memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
+ (const void *)((const char *)src + i), 64);
+ i += 64;
+ }
+ if (i + 32 <= nbytes) {
+ memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
+ (const void *)((const char *)src + i), 32);
+ i += 32;
+ }
+ if (i + 16 <= nbytes) {
+ memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
+ (const void *)((const char *)src + i), 16);
+ }
+
+ i = nbytes - 16;
+ tail = load_intrinreg16((const void *)((const char *)src + i));
+ store_intrinreg16((void *)((char *)dst + i), tail);
+}
+
+static inline void memclr_gte16_sse_varlen(void *dst, size_t nbytes)
+{
+ size_t i = 0;
+ const intrinreg16 zero = { 0 };
+
+ assert(nbytes >= 16);
+
+ while (i + 128 <= nbytes) {
+ memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 128);
+ i += 128;
+ }
+ if (i + 64 <= nbytes) {
+ memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 64);
+ i += 64;
+ }
+ if (i + 32 <= nbytes) {
+ memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 32);
+ i += 32;
+ }
+ if (i + 16 <= nbytes) {
+ memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 16);
+ }
+
+ i = nbytes - 16;
+ store_intrinreg16((void *)((char *)dst + i), zero);
+}
+
+static inline void memcpy_sse_fixedlen(void *dst, const void *src, size_t nbytes)
+{
+ if (nbytes >= 16)
+ memcpy_gte16_sse_fixedlen(dst, src, nbytes);
+ else
+ memcpy_lte32_sse_fixedlen(dst, src, nbytes);
+}
+
+static inline void memclr_sse_fixedlen(void *dst, size_t nbytes)
+{
+ if (nbytes >= 16)
+ memclr_gte16_sse_fixedlen(dst, nbytes);
+ else
+ memclr_lte32_sse_fixedlen(dst, nbytes);
+}
+
+static inline void memcpy_sse_varlen(void *dst, const void *src, size_t nbytes)
+{
+ if (nbytes >= 16)
+ memcpy_gte16_sse_varlen(dst, src, nbytes);
+ else
+ memcpy_lte32_sse_varlen(dst, src, nbytes);
+}
+
+static inline void memclr_sse_varlen(void *dst, size_t nbytes)
+{
+ if (nbytes >= 16)
+ memclr_gte16_sse_varlen(dst, nbytes);
+ else
+ memclr_lte32_sse_varlen(dst, nbytes);
+}
+#else
+#define memcpy_varlen memcpy
+#define memcpy_fixedlen memcpy
+
+#define memclr_varlen(dst,n) memset(dst,0,n)
+#define memclr_fixedlen(dst,n) memset(dst,0,n)
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _MEMCPY_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h
new file mode 100644
index 000000000..eac3be031
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h
@@ -0,0 +1,315 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA1_H_
+#define _MH_SHA1_H_
+
+/**
+ * @file mh_sha1.h
+ * @brief mh_sha1 function prototypes and structures
+ *
+ * Interface for mh_sha1 functions
+ *
+ * <b> mh_sha1 Init-Update..Update-Finalize </b>
+ *
+ * This file defines the interface to optimized functions used in mh_sha1.
+ * The definition of multi-hash SHA1(mh_sha1, for short) is: Pad the buffer
+ * in SHA1 style until the total length is a multiple of 4*16*16
+ * (words-width * parallel-segments * block-size); Hash the buffer in
+ * parallel, generating digests of 4*16*5 (words-width*parallel-segments*
+ * digest-size); Treat the set of digests as another data buffer, and
+ * generate a final SHA1 digest for it.
+ *
+ *
+ * Example
+ * \code
+ * uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS];
+ * struct mh_sha1_ctx *ctx;
+ *
+ * ctx = malloc(sizeof(struct mh_sha1_ctx));
+ * mh_sha1_init(ctx);
+ * mh_sha1_update(ctx, buff, block_len);
+ * mh_sha1_finalize(ctx, mh_sha1_digest);
+ * \endcode
+ */
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+// External Interface Definition
+#define HASH_SEGS 16
+#define SHA1_BLOCK_SIZE 64
+#define MH_SHA1_BLOCK_SIZE (HASH_SEGS * SHA1_BLOCK_SIZE)
+#define SHA1_DIGEST_WORDS 5
+#define AVX512_ALIGNED 64
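+
+/* Derived sizes implied by the constants above: one multi-hash block is
+ * MH_SHA1_BLOCK_SIZE = 16 * 64 = 1024 bytes, and the interim digest area in
+ * struct mh_sha1_ctx below is sizeof(uint32_t) * SHA1_DIGEST_WORDS * HASH_SEGS
+ * = 4 * 5 * 16 = 320 bytes. */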
+
+/** @brief Holds info describing a single mh_sha1
+ *
+ * It is better to allocate this data structure on the heap to avoid stack overflow.
+ *
+*/
+struct mh_sha1_ctx {
+ uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]; //!< the digest of multi-hash SHA1
+
+ uint64_t total_length;
+	//!< Total length in bytes of the input data processed so far by the update feature
+ uint8_t partial_block_buffer [MH_SHA1_BLOCK_SIZE * 2];
+ //!< Padding the tail of input data for SHA1
+ uint8_t mh_sha1_interim_digests[sizeof(uint32_t) * SHA1_DIGEST_WORDS * HASH_SEGS];
+	//!< Stores the interim SHA1 digests of all 16 segments; copied to the stack when used so it can be 64-byte aligned.
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE + AVX512_ALIGNED];
+	//!< Scratch buffer for restructuring SHA1 block data from the segments into big-endian form; AVX512_ALIGNED leaves room for 64-byte alignment.
+};
+
+/**
+ * @enum mh_sha1_ctx_error
+ * @brief CTX error flags
+ */
+enum mh_sha1_ctx_error{
+ MH_SHA1_CTX_ERROR_NONE = 0, //!< MH_SHA1_CTX_ERROR_NONE
+ MH_SHA1_CTX_ERROR_NULL = -1, //!< MH_SHA1_CTX_ERROR_NULL
+};
+
+
+/*******************************************************************
+ * mh_sha1 API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the mh_sha1_ctx structure.
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_init (struct mh_sha1_ctx* ctx);
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_update (struct mh_sha1_ctx * ctx, const void* buffer, uint32_t len);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digest in mh_sha1_digest which must have enough space
+ * for the outputs.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_finalize (struct mh_sha1_ctx* ctx, void* mh_sha1_digest);
+
+/*******************************************************************
+ * multi-types of mh_sha1 internal API
+ *
+ * XXXX        The multi-binary version
+ * XXXX_base   The C reference version, used to illustrate the algorithm
+ * XXXX_sse    The version that uses an ASM function optimized for SSE
+ * XXXX_avx    The version that uses an ASM function optimized for AVX
+ * XXXX_avx2   The version that uses an ASM function optimized for AVX2
+ * XXXX_avx512 The version that uses an ASM function optimized for AVX512
+ *
+ ******************************************************************/
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * Base update() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_update_base (struct mh_sha1_ctx* ctx, const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_update_sse (struct mh_sha1_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_update_avx (struct mh_sha1_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_update_avx2 (struct mh_sha1_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_update_avx512 (struct mh_sha1_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digests in mh_sha1_digest,
+ * which must have enough space for the outputs.
+ * Base Finalize() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_finalize_base (struct mh_sha1_ctx* ctx,
+ void* mh_sha1_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digest in mh_sha1_digest which must have enough space
+ * for the outputs.
+ *
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_finalize_sse (struct mh_sha1_ctx* ctx,
+ void* mh_sha1_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digest in mh_sha1_digest which must have enough space
+ * for the outputs.
+ *
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_finalize_avx (struct mh_sha1_ctx* ctx,
+ void* mh_sha1_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digest in mh_sha1_digest which must have enough space
+ * for the outputs.
+ *
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_finalize_avx2 (struct mh_sha1_ctx* ctx,
+ void* mh_sha1_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digest in mh_sha1_digest which must have enough space
+ * for the outputs.
+ *
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_finalize_avx512 (struct mh_sha1_ctx* ctx,
+ void* mh_sha1_digest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h
new file mode 100644
index 000000000..1c07306ec
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h
@@ -0,0 +1,327 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA1_MURMUR3_X64_128_H_
+#define _MH_SHA1_MURMUR3_X64_128_H_
+
+/**
+ * @file mh_sha1_murmur3_x64_128.h
+ * @brief mh_sha1_murmur3_x64_128 function prototypes and structures
+ *
+ * Interface for mh_sha1_murmur3_x64_128 functions
+ *
+ * <b> mh_sha1_murmur3_x64_128 Init-Update..Update-Finalize </b>
+ *
+ * This file defines the interface to optimized functions used in mh_sha1 and
+ * mh_sha1_murmur3_x64_128. The definition of multi-hash SHA1 (mh_sha1,
+ * for short) is: Pad the buffer in SHA1 style until the total length is a multiple
+ * of 4*16*16 (words-width * parallel-segments * block-size); Hash the buffer
+ * in parallel, generating digests of 4*16*5 (words-width*parallel-segments*
+ * digest-size); Treat the set of digests as another data buffer, and generate
+ * a final SHA1 digest for it. mh_sha1_murmur3_x64_128 is a stitching function
+ * that computes a murmur3_x64_128 digest while generating the mh_sha1 digest.
+ *
+ *
+ * Example
+ * \code
+ * uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS];
+ * uint32_t murmur_digest[MURMUR3_x64_128_DIGEST_WORDS];
+ * struct mh_sha1_murmur3_x64_128_ctx *ctx;
+ *
+ * ctx = malloc(sizeof(struct mh_sha1_murmur3_x64_128_ctx));
+ * mh_sha1_murmur3_x64_128_init(ctx, 0);
+ * mh_sha1_murmur3_x64_128_update(ctx, buff, block_len);
+ * mh_sha1_murmur3_x64_128_finalize(ctx, mh_sha1_digest,
+ * murmur_digest);
+ * \endcode
+ */
+
+#include <stdint.h>
+#include "mh_sha1.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+// External Interface Definition
+// Add murmur3_x64_128 definition
+#define MUR_BLOCK_SIZE (2 * sizeof(uint64_t))
+#define MURMUR3_x64_128_DIGEST_WORDS 4
+
+/** @brief Holds info describing a single mh_sha1_murmur3_x64_128
+ *
+ * It is better to allocate this data structure on the heap to avoid stack overflow.
+ *
+*/
+struct mh_sha1_murmur3_x64_128_ctx {
+ uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]; //!< the digest of multi-hash SHA1
+ uint32_t murmur3_x64_128_digest[MURMUR3_x64_128_DIGEST_WORDS]; //!< the digest of murmur3_x64_128
+
+ uint64_t total_length;
+	//!< Total length in bytes of the input data processed so far (bookkeeping for the update flow)
+ uint8_t partial_block_buffer [MH_SHA1_BLOCK_SIZE * 2];
+	//!< Holds the unprocessed tail of the input data and its SHA1-style padding
+ uint8_t mh_sha1_interim_digests[sizeof(uint32_t) * SHA1_DIGEST_WORDS * HASH_SEGS];
+	//!< Stores the SHA1 interim digests of all 16 segments; each use copies it to the stack to obtain 64-byte alignment
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE + AVX512_ALIGNED];
+	//!< Restructures SHA1 block data from the different segments into big-endian form; sized with AVX512_ALIGNED for 64-byte alignment
+};
+
+/**
+ * @enum mh_sha1_murmur3_ctx_error
+ * @brief CTX error flags
+ */
+enum mh_sha1_murmur3_ctx_error{
+ MH_SHA1_MURMUR3_CTX_ERROR_NONE = 0, //!< MH_SHA1_MURMUR3_CTX_ERROR_NONE
+	MH_SHA1_MURMUR3_CTX_ERROR_NULL = -1, //!< MH_SHA1_MURMUR3_CTX_ERROR_NULL
+};
+
+
+/*******************************************************************
+ * mh_sha1_murmur3_x64_128 API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the mh_sha1_murmur3_x64_128_ctx structure.
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param murmur_seed Seed as an initial digest of murmur3
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_murmur3_x64_128_init (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ uint64_t murmur_seed);
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_murmur3_x64_128_update (struct mh_sha1_murmur3_x64_128_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_murmur3_x64_128_finalize (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
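+/*
+ * Illustrative streaming sketch (not part of the upstream interface): the
+ * Init-Update..Update-Finalize flow above applied to chunked input, with the
+ * return codes checked. The chunk size, the read_chunk() helper and the
+ * omission of a malloc() NULL check are assumptions made only for this
+ * example.
+ *
+ * \code
+ * struct mh_sha1_murmur3_x64_128_ctx *ctx = malloc(sizeof(*ctx));
+ * uint32_t sha1_digest[SHA1_DIGEST_WORDS];
+ * uint32_t murmur_digest[MURMUR3_x64_128_DIGEST_WORDS];
+ * uint8_t chunk[4096];
+ * int n, ret;
+ *
+ * ret = mh_sha1_murmur3_x64_128_init(ctx, 0);
+ * while (ret == 0 && (n = read_chunk(chunk, sizeof(chunk))) > 0)
+ *	ret = mh_sha1_murmur3_x64_128_update(ctx, chunk, n);
+ * if (ret == 0)
+ *	ret = mh_sha1_murmur3_x64_128_finalize(ctx, sha1_digest, murmur_digest);
+ * free(ctx);
+ * \endcode
+ */
+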
+/*******************************************************************
+ * multi-types of mh_sha1_murmur3_x64_128 internal API
+ *
+ * XXXX The multi-binary version
+ * XXXX_base	The C code version, used to illustrate the algorithm
+ * XXXX_sse	The version that uses an ASM function optimized for SSE
+ * XXXX_avx	The version that uses an ASM function optimized for AVX
+ * XXXX_avx2	The version that uses an ASM function optimized for AVX2
+ * XXXX_avx512	The version that uses an ASM function optimized for AVX512
+ *
+ ******************************************************************/
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * Base update() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_update_base (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_update_sse (struct mh_sha1_murmur3_x64_128_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_update_avx (struct mh_sha1_murmur3_x64_128_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_update_avx2 (struct mh_sha1_murmur3_x64_128_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_update_avx512 (struct mh_sha1_murmur3_x64_128_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ * Base Finalize() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_finalize_base (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_finalize_sse (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_finalize_avx (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_finalize_avx2 (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_finalize_avx512 (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/src/crypto/isa-l/isa-l_crypto/include/mh_sha256.h b/src/crypto/isa-l/isa-l_crypto/include/mh_sha256.h
new file mode 100644
index 000000000..4a453833e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/mh_sha256.h
@@ -0,0 +1,315 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA256_H_
+#define _MH_SHA256_H_
+
+/**
+ * @file mh_sha256.h
+ * @brief mh_sha256 function prototypes and structures
+ *
+ * Interface for mh_sha256 functions
+ *
+ * <b> mh_sha256 Init-Update..Update-Finalize </b>
+ *
+ * This file defines the interface to optimized functions used in mh_sha256.
+ * The definition of multi-hash SHA256 (mh_sha256, for short) is: Pad the buffer
+ * in SHA256 style until the total length is a multiple of 4*16*16
+ * (words-width * parallel-segments * block-size); Hash the buffer in
+ * parallel, generating digests of 4*16*8 (words-width*parallel-segments*
+ * digest-size); Treat the set of digests as another data buffer, and
+ * generate a final SHA256 digest for it.
+ *
+ *
+ * Example
+ * \code
+ * uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS];
+ * struct mh_sha256_ctx *ctx;
+ *
+ * ctx = malloc(sizeof(struct mh_sha256_ctx));
+ * mh_sha256_init(ctx);
+ * mh_sha256_update(ctx, buff, block_len);
+ * mh_sha256_finalize(ctx, mh_sha256_digest);
+ * \endcode
+ */
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+// External Interface Definition
+#define HASH_SEGS 16
+#define SHA256_BLOCK_SIZE 64
+#define MH_SHA256_BLOCK_SIZE (HASH_SEGS * SHA256_BLOCK_SIZE)
+#define SHA256_DIGEST_WORDS 8
+#define AVX512_ALIGNED 64
+
+/** @brief Holds info describing a single mh_sha256
+ *
+ * It is better to allocate this data structure on the heap to avoid stack overflow.
+ *
+*/
+struct mh_sha256_ctx {
+ uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]; //!< the digest of multi-hash SHA256
+
+ uint64_t total_length;
+	//!< Total length in bytes of the input data processed so far (bookkeeping for the update flow)
+ uint8_t partial_block_buffer [MH_SHA256_BLOCK_SIZE * 2];
+	//!< Holds the unprocessed tail of the input data and its SHA256-style padding
+ uint8_t mh_sha256_interim_digests[sizeof(uint32_t) * SHA256_DIGEST_WORDS * HASH_SEGS];
+	//!< Stores the SHA256 interim digests of all 16 segments; each use copies it to the stack to obtain 64-byte alignment
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE + AVX512_ALIGNED];
+	//!< Restructures SHA256 block data from the different segments into big-endian form; sized with AVX512_ALIGNED for 64-byte alignment
+};
+
+/**
+ * @enum mh_sha256_ctx_error
+ * @brief CTX error flags
+ */
+enum mh_sha256_ctx_error{
+ MH_SHA256_CTX_ERROR_NONE = 0, //!< MH_SHA256_CTX_ERROR_NONE
+ MH_SHA256_CTX_ERROR_NULL = -1, //!< MH_SHA256_CTX_ERROR_NULL
+};
+
+
+/*******************************************************************
+ * mh_sha256 API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the mh_sha256_ctx structure.
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha256_init (struct mh_sha256_ctx* ctx);
+
+/**
+ * @brief Multi-hash sha256 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha256_update (struct mh_sha256_ctx * ctx, const void* buffer, uint32_t len);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha256.
+ *
+ * Place the message digest in mh_sha256_digest which must have enough space
+ * for the outputs.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param mh_sha256_digest The digest of mh_sha256
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha256_finalize (struct mh_sha256_ctx* ctx, void* mh_sha256_digest);
+
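+/*
+ * Illustrative one-shot helper (a sketch, not part of the upstream API):
+ * wraps the Init-Update-Finalize sequence above for a single in-memory
+ * buffer, allocating the context on the heap as recommended. The helper
+ * name mh_sha256_oneshot is an assumption made only for this example.
+ *
+ * \code
+ * static int mh_sha256_oneshot(const void *buf, uint32_t len,
+ *			       uint32_t digest[SHA256_DIGEST_WORDS])
+ * {
+ *	struct mh_sha256_ctx *ctx = malloc(sizeof(*ctx));
+ *	int ret;
+ *
+ *	if (ctx == NULL)
+ *		return MH_SHA256_CTX_ERROR_NULL;
+ *	ret = mh_sha256_init(ctx);
+ *	if (ret == 0)
+ *		ret = mh_sha256_update(ctx, buf, len);
+ *	if (ret == 0)
+ *		ret = mh_sha256_finalize(ctx, digest);
+ *	free(ctx);
+ *	return ret;
+ * }
+ * \endcode
+ */
+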
+/*******************************************************************
+ * multi-types of mh_sha256 internal API
+ *
+ * XXXX The multi-binary version
+ * XXXX_base	The C code version, used to illustrate the algorithm
+ * XXXX_sse	The version that uses an ASM function optimized for SSE
+ * XXXX_avx	The version that uses an ASM function optimized for AVX
+ * XXXX_avx2	The version that uses an ASM function optimized for AVX2
+ * XXXX_avx512	The version that uses an ASM function optimized for AVX512
+ *
+ ******************************************************************/
+
+/**
+ * @brief Multi-hash sha256 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * Base update() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_update_base (struct mh_sha256_ctx* ctx, const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha256 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_update_sse (struct mh_sha256_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha256 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_update_avx (struct mh_sha256_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha256 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_update_avx2 (struct mh_sha256_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha256 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_update_avx512 (struct mh_sha256_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+
+/**
+ * @brief Finalize the message digests for multi-hash sha256.
+ *
+ * Place the message digests in mh_sha256_digest,
+ * which must have enough space for the outputs.
+ * Base Finalize() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param mh_sha256_digest The digest of mh_sha256
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_finalize_base (struct mh_sha256_ctx* ctx,
+ void* mh_sha256_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha256.
+ *
+ * Place the message digests in mh_sha256_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param mh_sha256_digest The digest of mh_sha256
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_finalize_sse (struct mh_sha256_ctx* ctx,
+ void* mh_sha256_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha256.
+ *
+ * Place the message digests in mh_sha256_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param mh_sha256_digest The digest of mh_sha256
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_finalize_avx (struct mh_sha256_ctx* ctx,
+ void* mh_sha256_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha256.
+ *
+ * Place the message digests in mh_sha256_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param mh_sha256_digest The digest of mh_sha256
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_finalize_avx2 (struct mh_sha256_ctx* ctx,
+ void* mh_sha256_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha256.
+ *
+ * Place the message digests in mh_sha256_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha256 info
+ * @param mh_sha256_digest The digest of mh_sha256
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha256_finalize_avx512 (struct mh_sha256_ctx* ctx,
+ void* mh_sha256_digest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h b/src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h
new file mode 100644
index 000000000..ac88f7b0a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h
@@ -0,0 +1,112 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MULTI_BUFFER_H_
+#define _MULTI_BUFFER_H_
+
+/**
+ * @file multi_buffer.h
+ * @brief Multi-buffer common fields
+ *
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @enum JOB_STS
+ * @brief Job return codes
+ */
+
+typedef enum {STS_UNKNOWN = 0, //!< STS_UNKNOWN
+ STS_BEING_PROCESSED = 1,//!< STS_BEING_PROCESSED
+ STS_COMPLETED = 2, //!< STS_COMPLETED
+ STS_INTERNAL_ERROR, //!< STS_INTERNAL_ERROR
+ STS_ERROR //!< STS_ERROR
+} JOB_STS;
+
+#define HASH_MB_NO_FLAGS 0
+#define HASH_MB_FIRST 1
+#define HASH_MB_LAST 2
+
+/* Common flags for the new API only */
+
+/**
+ * @enum HASH_CTX_FLAG
+ * @brief CTX job type
+ */
+typedef enum {
+ HASH_UPDATE = 0x00, //!< HASH_UPDATE
+ HASH_FIRST = 0x01, //!< HASH_FIRST
+ HASH_LAST = 0x02, //!< HASH_LAST
+ HASH_ENTIRE = 0x03, //!< HASH_ENTIRE
+} HASH_CTX_FLAG;
+
+/**
+ * @enum HASH_CTX_STS
+ * @brief CTX status flags
+ */
+typedef enum {
+ HASH_CTX_STS_IDLE = 0x00, //!< HASH_CTX_STS_IDLE
+ HASH_CTX_STS_PROCESSING = 0x01, //!< HASH_CTX_STS_PROCESSING
+ HASH_CTX_STS_LAST = 0x02, //!< HASH_CTX_STS_LAST
+ HASH_CTX_STS_COMPLETE = 0x04, //!< HASH_CTX_STS_COMPLETE
+} HASH_CTX_STS;
+
+/**
+ * @enum HASH_CTX_ERROR
+ * @brief CTX error flags
+ */
+typedef enum {
+ HASH_CTX_ERROR_NONE = 0, //!< HASH_CTX_ERROR_NONE
+ HASH_CTX_ERROR_INVALID_FLAGS = -1, //!< HASH_CTX_ERROR_INVALID_FLAGS
+ HASH_CTX_ERROR_ALREADY_PROCESSING = -2, //!< HASH_CTX_ERROR_ALREADY_PROCESSING
+ HASH_CTX_ERROR_ALREADY_COMPLETED = -3, //!< HASH_CTX_ERROR_ALREADY_COMPLETED
+} HASH_CTX_ERROR;
+
+
+#define hash_ctx_user_data(ctx) ((ctx)->user_data)
+#define hash_ctx_digest(ctx) ((ctx)->job.result_digest)
+#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
+#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE)
+#define hash_ctx_status(ctx) ((ctx)->status)
+#define hash_ctx_error(ctx) ((ctx)->error)
+#define hash_ctx_init(ctx) \
+ do { \
+ (ctx)->error = HASH_CTX_ERROR_NONE; \
+ (ctx)->status = HASH_CTX_STS_COMPLETE; \
+ } while(0)
+
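+/*
+ * Typical lifecycle (a sketch; the concrete CTX and manager types live in the
+ * per-hash headers such as sha1_mb.h): run hash_ctx_init() once on a new
+ * context, submit the first buffer with HASH_FIRST, follow-on buffers with
+ * HASH_UPDATE and the final buffer with HASH_LAST (or a single buffer with
+ * HASH_ENTIRE). Once hash_ctx_complete() reports true, the status is
+ * HASH_CTX_STS_COMPLETE and hash_ctx_digest() yields the result digest.
+ */
+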
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _MULTI_BUFFER_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/multibinary.asm b/src/crypto/isa-l/isa-l_crypto/include/multibinary.asm
new file mode 100644
index 000000000..4dd019319
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/multibinary.asm
@@ -0,0 +1,517 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _MULTIBINARY_ASM_
+%define _MULTIBINARY_ASM_
+
+%ifidn __OUTPUT_FORMAT__, elf32
+ %define mbin_def_ptr dd
+ %define mbin_ptr_sz dword
+ %define mbin_rdi edi
+ %define mbin_rsi esi
+ %define mbin_rax eax
+ %define mbin_rbx ebx
+ %define mbin_rcx ecx
+ %define mbin_rdx edx
+%else
+ %define mbin_def_ptr dq
+ %define mbin_ptr_sz qword
+ %define mbin_rdi rdi
+ %define mbin_rsi rsi
+ %define mbin_rax rax
+ %define mbin_rbx rbx
+ %define mbin_rcx rcx
+ %define mbin_rdx rdx
+%endif
+
+%ifndef AS_FEATURE_LEVEL
+%define AS_FEATURE_LEVEL 4
+%endif
+
+;;;;
+; multibinary macro:
+; creates the visible entry point that uses the HW optimized call pointer
+; creates the init of the HW optimized call pointer
+;;;;
+%macro mbin_interface 1
+ ;;;;
+ ; *_dispatched is defaulted to *_mbinit and replaced on first call.
+ ; Therefore, *_dispatch_init is only executed on first call.
+ ;;;;
+ section .data
+ %1_dispatched:
+ mbin_def_ptr %1_mbinit
+
+ section .text
+ mk_global %1, function
+ %1_mbinit:
+ ;;; only called the first time to setup hardware match
+ call %1_dispatch_init
+ ;;; falls thru to execute the hw optimized code
+ %1:
+ jmp mbin_ptr_sz [%1_dispatched]
+%endmacro
+
+;;;;;
+; mbin_dispatch_init parameters
+; Use this function when SSE/00/01 is a minimum requirement
+; 1-> function name
+; 2-> SSE/00/01 optimized function used as base
+; 3-> AVX or AVX/02 opt func
+; 4-> AVX2 or AVX/04 opt func
+;;;;;
+%macro mbin_dispatch_init 4
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01
+
+ mov eax, 1
+ cpuid
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func
+ jne _%1_init_done ; AVX is not available so end
+ mov mbin_rsi, mbin_rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _%1_init_done
+ lea mbin_rsi, [%2 WRT_OPT]
+
+ _%1_init_done:
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
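+
+;;;;
+; Illustrative usage sketch (not taken from the upstream sources): a module
+; providing foo_sse/foo_avx/foo_avx2 variants (hypothetical names) would
+; typically emit
+;	mbin_interface		foo
+;	mbin_dispatch_init	foo, foo_sse, foo_avx, foo_avx2
+; so that the first call to foo() runs foo_dispatch_init, stores the address
+; of the best implementation in foo_dispatched, and every later call jumps
+; straight to it through that pointer.
+;;;;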
+
+;;;;;
+; mbin_dispatch_init2 parameters
+; Cases where only base functions are available
+; 1-> function name
+; 2-> base function
+;;;;;
+%macro mbin_dispatch_init2 2
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init5 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_1 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+;;;;;
+%macro mbin_dispatch_init5 5
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ ; Test for SSE4.1
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ lea mbin_rbx, [%3 WRT_OPT] ; SSE opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen2) opt func
+ jne _%1_init_done ; AVX is not available so end
+ mov mbin_rsi, mbin_rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ lea mbin_rbx, [%5 WRT_OPT] ; AVX (gen4) opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _%1_init_done
+ lea mbin_rsi, [%3 WRT_OPT]
+
+ _%1_init_done:
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+%if AS_FEATURE_LEVEL >= 6
+;;;;;
+; mbin_dispatch_init6 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_1 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+;;;;;
+%macro mbin_dispatch_init6 6
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ je _%1_init_done ; Use base function if no SSE4_1
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+	mov	edi, eax	; save xgetbv.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_init_done
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+ lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+ cmove mbin_rsi, mbin_rbx
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+%else
+%macro mbin_dispatch_init6 6
+ mbin_dispatch_init5 %1, %2, %3, %4, %5
+%endmacro
+%endif
+
+%if AS_FEATURE_LEVEL >= 10
+;;;;;
+; mbin_dispatch_init7 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_2 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+; 7-> AVX512 Update/10 opt func
+;;;;;
+%macro mbin_dispatch_init7 7
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_2
+ je _%1_init_done ; Use base function if no SSE4_2
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+	mov	edi, eax	; save xgetbv.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_init_done
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+ lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+ cmove mbin_rsi, mbin_rbx
+
+ and ecx, FLAGS_CPUID7_ECX_AVX512_G2
+ cmp ecx, FLAGS_CPUID7_ECX_AVX512_G2
+	lea	mbin_rbx, [%7 WRT_OPT] ; AVX512/10 opt
+ cmove mbin_rsi, mbin_rbx
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+%else
+%macro mbin_dispatch_init7 7
+ mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
+%endmacro
+%endif
+
+;;;;;
+; mbin_dispatch_sse_to_avx2_shani parameters
+; derived from mbin_dispatch_init
+; Use this function when SSE/00/01 is a minimum requirement
+; 1-> function name
+; 2-> SSE/00/01 optimized function used as base
+; 3-> AVX or AVX/02 opt func
+; 4-> AVX2 or AVX/04 opt func
+; 5-> SHANI opt for GLM
+;;;;;
+%macro mbin_dispatch_sse_to_avx2_shani 5
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01
+
+ mov eax, 1
+ cpuid
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func
+ jne _%1_shani_check ; AVX is not available so check shani
+ mov mbin_rsi, mbin_rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _%1_init_done
+ lea mbin_rsi, [%2 WRT_OPT]
+
+ _%1_init_done:
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+
+ _%1_shani_check:
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_SHA
+ lea mbin_rbx, [%5 WRT_OPT] ; SHANI opt func
+ cmovne mbin_rsi, mbin_rbx
+ jmp _%1_init_done ; end
+%endmacro
+
+;;;;;
+; mbin_dispatch_base_to_avx512_shani parameters
+; derived from mbin_dispatch_init6
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_2 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+; 7-> SHANI opt for GLM
+; 8-> SHANI opt for CNL
+;;;;;
+%macro mbin_dispatch_base_to_avx512_shani 8
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_2
+ je _%1_init_done ; Use base function if no SSE4_2
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_shani_check
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+	mov	edi, eax	; save xgetbv.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_shani_check
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_shani_check
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+ lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+ cmove mbin_rsi, mbin_rbx
+
+ ;; Test for SHANI
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_SHA
+	lea	mbin_rbx, [%8 WRT_OPT] ; SHANI opt func for CNL
+ cmovne mbin_rsi, mbin_rbx
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+
+ _%1_shani_check:
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_SHA
+ lea mbin_rbx, [%7 WRT_OPT] ; SHANI opt sse func
+ cmovne mbin_rsi, mbin_rbx
+ jmp _%1_init_done ; end
+%endmacro
+
+
+
+%endif ; ifndef _MULTIBINARY_ASM_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm b/src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm
new file mode 100644
index 000000000..717dd0503
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm
@@ -0,0 +1,442 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _REG_SIZES_ASM_
+%define _REG_SIZES_ASM_
+
+%ifndef AS_FEATURE_LEVEL
+%define AS_FEATURE_LEVEL 4
+%endif
+
+%define EFLAGS_HAS_CPUID (1<<21)
+%define FLAG_CPUID1_ECX_CLMUL (1<<1)
+%define FLAG_CPUID1_EDX_SSE2 (1<<26)
+%define FLAG_CPUID1_ECX_SSE3 (1)
+%define FLAG_CPUID1_ECX_SSE4_1 (1<<19)
+%define FLAG_CPUID1_ECX_SSE4_2 (1<<20)
+%define FLAG_CPUID1_ECX_POPCNT (1<<23)
+%define FLAG_CPUID1_ECX_AESNI (1<<25)
+%define FLAG_CPUID1_ECX_OSXSAVE (1<<27)
+%define FLAG_CPUID1_ECX_AVX (1<<28)
+%define FLAG_CPUID1_EBX_AVX2 (1<<5)
+
+%define FLAG_CPUID7_EBX_AVX2 (1<<5)
+%define FLAG_CPUID7_EBX_AVX512F (1<<16)
+%define FLAG_CPUID7_EBX_AVX512DQ (1<<17)
+%define FLAG_CPUID7_EBX_AVX512IFMA (1<<21)
+%define FLAG_CPUID7_EBX_AVX512PF (1<<26)
+%define FLAG_CPUID7_EBX_AVX512ER (1<<27)
+%define FLAG_CPUID7_EBX_AVX512CD (1<<28)
+%define FLAG_CPUID7_EBX_SHA (1<<29)
+%define FLAG_CPUID7_EBX_AVX512BW (1<<30)
+%define FLAG_CPUID7_EBX_AVX512VL (1<<31)
+
+%define FLAG_CPUID7_ECX_AVX512VBMI (1<<1)
+%define FLAG_CPUID7_ECX_AVX512VBMI2 (1 << 6)
+%define FLAG_CPUID7_ECX_GFNI (1 << 8)
+%define FLAG_CPUID7_ECX_VAES (1 << 9)
+%define FLAG_CPUID7_ECX_VPCLMULQDQ (1 << 10)
+%define FLAG_CPUID7_ECX_VNNI (1 << 11)
+%define FLAG_CPUID7_ECX_BITALG (1 << 12)
+%define FLAG_CPUID7_ECX_VPOPCNTDQ (1 << 14)
+
+%define FLAGS_CPUID7_EBX_AVX512_G1 (FLAG_CPUID7_EBX_AVX512F | FLAG_CPUID7_EBX_AVX512VL | FLAG_CPUID7_EBX_AVX512BW | FLAG_CPUID7_EBX_AVX512CD | FLAG_CPUID7_EBX_AVX512DQ)
+%define FLAGS_CPUID7_ECX_AVX512_G2 (FLAG_CPUID7_ECX_AVX512VBMI2 | FLAG_CPUID7_ECX_GFNI | FLAG_CPUID7_ECX_VAES | FLAG_CPUID7_ECX_VPCLMULQDQ | FLAG_CPUID7_ECX_VNNI | FLAG_CPUID7_ECX_BITALG | FLAG_CPUID7_ECX_VPOPCNTDQ)
+
+%define FLAG_XGETBV_EAX_XMM (1<<1)
+%define FLAG_XGETBV_EAX_YMM (1<<2)
+%define FLAG_XGETBV_EAX_XMM_YMM 0x6
+%define FLAG_XGETBV_EAX_ZMM_OPM 0xe0
+
+%define FLAG_CPUID1_EAX_AVOTON 0x000406d0
+%define FLAG_CPUID1_EAX_STEP_MASK 0xfffffff0
+
+; define d and w variants for registers
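+; (e.g. raxd/raxw/raxb alias eax/ax/al) so the DWORD()/WORD()/BYTE() macros
+; defined further below can select a sub-register by pasting a size suffix
+; onto the 64-bit register name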
+
+%define raxd eax
+%define raxw ax
+%define raxb al
+
+%define rbxd ebx
+%define rbxw bx
+%define rbxb bl
+
+%define rcxd ecx
+%define rcxw cx
+%define rcxb cl
+
+%define rdxd edx
+%define rdxw dx
+%define rdxb dl
+
+%define rsid esi
+%define rsiw si
+%define rsib sil
+
+%define rdid edi
+%define rdiw di
+%define rdib dil
+
+%define rbpd ebp
+%define rbpw bp
+%define rbpb bpl
+
+%define zmm0x xmm0
+%define zmm1x xmm1
+%define zmm2x xmm2
+%define zmm3x xmm3
+%define zmm4x xmm4
+%define zmm5x xmm5
+%define zmm6x xmm6
+%define zmm7x xmm7
+%define zmm8x xmm8
+%define zmm9x xmm9
+%define zmm10x xmm10
+%define zmm11x xmm11
+%define zmm12x xmm12
+%define zmm13x xmm13
+%define zmm14x xmm14
+%define zmm15x xmm15
+%define zmm16x xmm16
+%define zmm17x xmm17
+%define zmm18x xmm18
+%define zmm19x xmm19
+%define zmm20x xmm20
+%define zmm21x xmm21
+%define zmm22x xmm22
+%define zmm23x xmm23
+%define zmm24x xmm24
+%define zmm25x xmm25
+%define zmm26x xmm26
+%define zmm27x xmm27
+%define zmm28x xmm28
+%define zmm29x xmm29
+%define zmm30x xmm30
+%define zmm31x xmm31
+
+%define ymm0x xmm0
+%define ymm1x xmm1
+%define ymm2x xmm2
+%define ymm3x xmm3
+%define ymm4x xmm4
+%define ymm5x xmm5
+%define ymm6x xmm6
+%define ymm7x xmm7
+%define ymm8x xmm8
+%define ymm9x xmm9
+%define ymm10x xmm10
+%define ymm11x xmm11
+%define ymm12x xmm12
+%define ymm13x xmm13
+%define ymm14x xmm14
+%define ymm15x xmm15
+%define ymm16x xmm16
+%define ymm17x xmm17
+%define ymm18x xmm18
+%define ymm19x xmm19
+%define ymm20x xmm20
+%define ymm21x xmm21
+%define ymm22x xmm22
+%define ymm23x xmm23
+%define ymm24x xmm24
+%define ymm25x xmm25
+%define ymm26x xmm26
+%define ymm27x xmm27
+%define ymm28x xmm28
+%define ymm29x xmm29
+%define ymm30x xmm30
+%define ymm31x xmm31
+
+%define xmm0x xmm0
+%define xmm1x xmm1
+%define xmm2x xmm2
+%define xmm3x xmm3
+%define xmm4x xmm4
+%define xmm5x xmm5
+%define xmm6x xmm6
+%define xmm7x xmm7
+%define xmm8x xmm8
+%define xmm9x xmm9
+%define xmm10x xmm10
+%define xmm11x xmm11
+%define xmm12x xmm12
+%define xmm13x xmm13
+%define xmm14x xmm14
+%define xmm15x xmm15
+%define xmm16x xmm16
+%define xmm17x xmm17
+%define xmm18x xmm18
+%define xmm19x xmm19
+%define xmm20x xmm20
+%define xmm21x xmm21
+%define xmm22x xmm22
+%define xmm23x xmm23
+%define xmm24x xmm24
+%define xmm25x xmm25
+%define xmm26x xmm26
+%define xmm27x xmm27
+%define xmm28x xmm28
+%define xmm29x xmm29
+%define xmm30x xmm30
+%define xmm31x xmm31
+
+%define zmm0y ymm0
+%define zmm1y ymm1
+%define zmm2y ymm2
+%define zmm3y ymm3
+%define zmm4y ymm4
+%define zmm5y ymm5
+%define zmm6y ymm6
+%define zmm7y ymm7
+%define zmm8y ymm8
+%define zmm9y ymm9
+%define zmm10y ymm10
+%define zmm11y ymm11
+%define zmm12y ymm12
+%define zmm13y ymm13
+%define zmm14y ymm14
+%define zmm15y ymm15
+%define zmm16y ymm16
+%define zmm17y ymm17
+%define zmm18y ymm18
+%define zmm19y ymm19
+%define zmm20y ymm20
+%define zmm21y ymm21
+%define zmm22y ymm22
+%define zmm23y ymm23
+%define zmm24y ymm24
+%define zmm25y ymm25
+%define zmm26y ymm26
+%define zmm27y ymm27
+%define zmm28y ymm28
+%define zmm29y ymm29
+%define zmm30y ymm30
+%define zmm31y ymm31
+
+%define xmm0y ymm0
+%define xmm1y ymm1
+%define xmm2y ymm2
+%define xmm3y ymm3
+%define xmm4y ymm4
+%define xmm5y ymm5
+%define xmm6y ymm6
+%define xmm7y ymm7
+%define xmm8y ymm8
+%define xmm9y ymm9
+%define xmm10y ymm10
+%define xmm11y ymm11
+%define xmm12y ymm12
+%define xmm13y ymm13
+%define xmm14y ymm14
+%define xmm15y ymm15
+%define xmm16y ymm16
+%define xmm17y ymm17
+%define xmm18y ymm18
+%define xmm19y ymm19
+%define xmm20y ymm20
+%define xmm21y ymm21
+%define xmm22y ymm22
+%define xmm23y ymm23
+%define xmm24y ymm24
+%define xmm25y ymm25
+%define xmm26y ymm26
+%define xmm27y ymm27
+%define xmm28y ymm28
+%define xmm29y ymm29
+%define xmm30y ymm30
+%define xmm31y ymm31
+
+%define xmm0z zmm0
+%define xmm1z zmm1
+%define xmm2z zmm2
+%define xmm3z zmm3
+%define xmm4z zmm4
+%define xmm5z zmm5
+%define xmm6z zmm6
+%define xmm7z zmm7
+%define xmm8z zmm8
+%define xmm9z zmm9
+%define xmm10z zmm10
+%define xmm11z zmm11
+%define xmm12z zmm12
+%define xmm13z zmm13
+%define xmm14z zmm14
+%define xmm15z zmm15
+%define xmm16z zmm16
+%define xmm17z zmm17
+%define xmm18z zmm18
+%define xmm19z zmm19
+%define xmm20z zmm20
+%define xmm21z zmm21
+%define xmm22z zmm22
+%define xmm23z zmm23
+%define xmm24z zmm24
+%define xmm25z zmm25
+%define xmm26z zmm26
+%define xmm27z zmm27
+%define xmm28z zmm28
+%define xmm29z zmm29
+%define xmm30z zmm30
+%define xmm31z zmm31
+
+%define ymm0z zmm0
+%define ymm1z zmm1
+%define ymm2z zmm2
+%define ymm3z zmm3
+%define ymm4z zmm4
+%define ymm5z zmm5
+%define ymm6z zmm6
+%define ymm7z zmm7
+%define ymm8z zmm8
+%define ymm9z zmm9
+%define ymm10z zmm10
+%define ymm11z zmm11
+%define ymm12z zmm12
+%define ymm13z zmm13
+%define ymm14z zmm14
+%define ymm15z zmm15
+%define ymm16z zmm16
+%define ymm17z zmm17
+%define ymm18z zmm18
+%define ymm19z zmm19
+%define ymm20z zmm20
+%define ymm21z zmm21
+%define ymm22z zmm22
+%define ymm23z zmm23
+%define ymm24z zmm24
+%define ymm25z zmm25
+%define ymm26z zmm26
+%define ymm27z zmm27
+%define ymm28z zmm28
+%define ymm29z zmm29
+%define ymm30z zmm30
+%define ymm31z zmm31
+
+%define DWORD(reg) reg %+ d
+%define WORD(reg) reg %+ w
+%define BYTE(reg) reg %+ b
+
+%define XWORD(reg) reg %+ x
+%define YWORD(reg) reg %+ y
+%define ZWORD(reg) reg %+ z
+
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+ %define __x86_64__
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
+%ifidn __OUTPUT_FORMAT__,win64
+ %define __x86_64__
+%endif
+%ifidn __OUTPUT_FORMAT__,macho64
+ %define __x86_64__
+%endif
+
+%ifdef __x86_64__
+ %define endbranch db 0xf3, 0x0f, 0x1e, 0xfa
+%else
+ %define endbranch db 0xf3, 0x0f, 0x1e, 0xfb
+%endif
+
+%ifdef REL_TEXT
+ %define WRT_OPT
+%elifidn __OUTPUT_FORMAT__, elf64
+ %define WRT_OPT wrt ..plt
+%else
+ %define WRT_OPT
+%endif
+
+%macro mk_global 1-3
+ %ifdef __NASM_VER__
+ %ifidn __OUTPUT_FORMAT__, macho64
+ global %1
+ %elifidn __OUTPUT_FORMAT__, win64
+ global %1
+ %else
+ global %1:%2 %3
+ %endif
+ %else
+ global %1:%2 %3
+ %endif
+%endmacro
+
+
+; Fixes for nasm's lack of MS proc helpers
+%ifdef __NASM_VER__
+ %ifidn __OUTPUT_FORMAT__, win64
+ %macro alloc_stack 1
+ sub rsp, %1
+ %endmacro
+
+ %macro proc_frame 1
+ %1:
+ %endmacro
+
+ %macro save_xmm128 2
+ movdqa [rsp + %2], %1
+ %endmacro
+
+ %macro save_reg 2
+ mov [rsp + %2], %1
+ %endmacro
+
+ %macro rex_push_reg 1
+ push %1
+ %endmacro
+
+ %macro push_reg 1
+ push %1
+ %endmacro
+
+ %define end_prolog
+ %endif
+
+ %define endproc_frame
+%endif
+
+%ifidn __OUTPUT_FORMAT__, macho64
+ %define elf64 macho64
+ mac_equ equ 1
+%endif
+
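+; slversion: emits <name>_slver and <name>_slver_<ver> data symbols (the
+; version built from the three arguments) so the implementation version of
+; each routine can be identified in the built library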
+%macro slversion 4
+ section .text
+ global %1_slver_%2%3%4
+ global %1_slver
+ %1_slver:
+ %1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+
+%endif ; ifndef _REG_SIZES_ASM_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/rolling_hashx.h b/src/crypto/isa-l/isa-l_crypto/include/rolling_hashx.h
new file mode 100644
index 000000000..035cf1701
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/rolling_hashx.h
@@ -0,0 +1,114 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/**
+ * @file rolling_hashx.h
+ * @brief Fingerprint functions based on rolling hash
+ *
+ * rolling_hash2 - checks hash in a sliding window based on random 64-bit hash.
+ */
+
+#ifndef _ROLLING_HASHX_H_
+#define _ROLLING_HASHX_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+/**
+ * @brief rolling hash return values
+ */
+enum {
+ FINGERPRINT_RET_HIT = 0, //!< Fingerprint trigger hit
+ FINGERPRINT_RET_MAX, //!< Fingerprint max length reached before hit
+ FINGERPRINT_RET_OTHER //!< Fingerprint function error returned
+};
+
+#define FINGERPRINT_MAX_WINDOW 48
+
+/**
+ * @brief Context for rolling_hash2 functions
+ */
+struct rh_state2 {
+ uint8_t history[FINGERPRINT_MAX_WINDOW];
+ uint64_t table1[256];
+ uint64_t table2[256];
+ uint64_t hash;
+ uint32_t w;
+};
+
+/**
+ * @brief Initialize state object for rolling hash2
+ *
+ * @param state Structure holding state info on current rolling hash
+ * @param w Window width (1 <= w <= 32)
+ * @returns 0 - success, -1 - failure
+ */
+int rolling_hash2_init(struct rh_state2 *state, uint32_t w);
+
+/**
+ * @brief Reset the hash state history
+ *
+ * @param state Structure holding state info on current rolling hash
+ * @param init_bytes Optional buffer of window-size bytes used to pre-initialize the hash
+ * @returns none
+ */
+void rolling_hash2_reset(struct rh_state2 *state, uint8_t * init_bytes);
+
+/**
+ * @brief Run rolling hash function until trigger met or max length reached
+ *
+ * Checks for trigger based on a random hash in a sliding window.
+ * @param state Structure holding state info on current rolling hash
+ * @param buffer Pointer to input buffer to run windowed hash on
+ * @param max_len Max length to run over input
+ * @param mask Mask bits ORed with hash before test with trigger
+ * @param trigger Match value to compare with windowed hash at each input byte
+ * @param offset Offset from buffer to match, set if match found
+ * @returns FINGERPRINT_RET_HIT - match found, FINGERPRINT_RET_MAX - exceeded max length
+ */
+int rolling_hash2_run(struct rh_state2 *state, uint8_t * buffer, uint32_t max_len,
+ uint32_t mask, uint32_t trigger, uint32_t * offset);
+
+/**
+ * @brief Generate an appropriate mask to target mean hit rate
+ *
+ * @param mean Target chunk size in bytes
+ * @param shift Bits to rotate result to get independent masks
+ * @returns 32-bit mask value
+ */
+uint32_t rolling_hashx_mask_gen(long mean, int shift);
+
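+/*
+ * Illustrative content-defined chunking sketch (a usage outline, not part of
+ * the upstream interface): scan a buffer with rolling_hash2_run() and cut a
+ * chunk wherever the windowed hash matches the trigger, or at a 32 KB cap if
+ * no trigger fires. The input (data/data_len), the 8 KB target mean, the cap,
+ * the trigger choice and the consume_chunk() helper are assumptions made only
+ * for this example.
+ *
+ * \code
+ * struct rh_state2 *state = malloc(sizeof(*state));
+ * uint32_t mask = rolling_hashx_mask_gen(8 * 1024, 0);
+ * uint32_t trigger = 0x123 & mask;
+ * uint8_t *p = data;
+ * uint32_t remain = data_len, offset, cut;
+ *
+ * rolling_hash2_init(state, 32);
+ * while (remain > 0) {
+ *	uint32_t max = remain < 32 * 1024 ? remain : 32 * 1024;
+ *	int ret = rolling_hash2_run(state, p, max, mask, trigger, &offset);
+ *	cut = (ret == FINGERPRINT_RET_HIT && offset != 0) ? offset : max;
+ *	consume_chunk(p, cut);
+ *	p += cut;
+ *	remain -= cut;
+ * }
+ * free(state);
+ * \endcode
+ */
+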
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _ROLLING_HASHX_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h b/src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h
new file mode 100644
index 000000000..3a41684b4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h
@@ -0,0 +1,450 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _SHA1_MB_H_
+#define _SHA1_MB_H_
+
+/**
+ * @file sha1_mb.h
+ * @brief Multi-buffer CTX API SHA1 function prototypes and structures
+ *
+ * Interface for multi-buffer SHA1 functions
+ *
+ * <b> Multi-buffer SHA1 Entire or First-Update..Update-Last </b>
+ *
+ * The interface to this multi-buffer hashing code is carried out through the
+ * context-level (CTX) init, submit and flush functions and the SHA1_HASH_CTX_MGR and
+ * SHA1_HASH_CTX objects. Numerous SHA1_HASH_CTX objects may be instantiated by the
+ * application for use with a single SHA1_HASH_CTX_MGR.
+ *
+ * The CTX interface functions carry out the initialization and padding of the jobs
+ * entered by the user and add them to the multi-buffer manager. The lower level "scheduler"
+ * layer then processes the jobs in an out-of-order manner. The scheduler layer functions
+ * are internal and are not intended to be invoked directly. Jobs can be submitted
+ * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial
+ * jobs which can be started using the HASH_FIRST flag, and later resumed or finished
+ * using the HASH_UPDATE and HASH_LAST flags respectively.
+ *
+ * <b>Note:</b> The submit function does not require data buffers to be block sized.
+ *
+ * The SHA1 CTX interface functions are available for 4 architectures: SSE, AVX, AVX2 and
+ * AVX512. In addition, a multibinary interface is provided, which selects the appropriate
+ * architecture-specific function at runtime.
+ *
+ * <b>Usage:</b> The application creates a SHA1_HASH_CTX_MGR object and initializes it
+ * with a call to the sha1_ctx_mgr_init*() function, where henceforth "*" stands for the
+ * relevant suffix for each architecture: _sse, _avx, _avx2, _avx512 (or no suffix for the
+ * multibinary version). The SHA1_HASH_CTX_MGR object will be used to schedule processor
+ * resources, with up to 4 SHA1_HASH_CTX objects (or 8 in the AVX2 case, 16 in the AVX512)
+ * being processed at a time.
+ *
+ * Each SHA1_HASH_CTX must be initialized before first use by the hash_ctx_init macro
+ * defined in multi_buffer.h. After initialization, the application may begin computing
+ * a hash by giving the SHA1_HASH_CTX to a SHA1_HASH_CTX_MGR using the submit functions
+ * sha1_ctx_mgr_submit*() with the HASH_FIRST flag set. When the SHA1_HASH_CTX is
+ * returned to the application (via this or a later call to sha1_ctx_mgr_submit*() or
+ * sha1_ctx_mgr_flush*()), the application can then re-submit it with another call to
+ * sha1_ctx_mgr_submit*(), but without the HASH_FIRST flag set.
+ *
+ * Ideally, on the last buffer for that hash, sha1_ctx_mgr_submit*() is called with
+ * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero
+ * length if necessary. When a SHA1_HASH_CTX is returned after having been submitted with
+ * HASH_LAST, it will contain a valid hash. The SHA1_HASH_CTX can be reused immediately
+ * by submitting with HASH_FIRST.
+ *
+ * For example, you would submit hashes with the following flags for the following numbers
+ * of buffers:
+ * <ul>
+ * <li> one buffer: HASH_FIRST | HASH_LAST (or, equivalently, HASH_ENTIRE)
+ * <li> two buffers: HASH_FIRST, HASH_LAST
+ * <li> three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * <li> etc.
+ * </ul>
+ *
+ * The order in which SHA1_CTX objects are returned is in general different from the order
+ * in which they are submitted.
+ *
+ * A few possible error conditions exist:
+ * <ul>
+ * <li> Submitting flags other than the allowed entire/first/update/last values
+ * <li> Submitting a context that is currently being managed by a SHA1_HASH_CTX_MGR.
+ * <li> Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ * </ul>
+ *
+ * These error conditions are reported by returning the SHA1_HASH_CTX immediately after
+ * a submit with its error member set to a non-zero error code (defined in
+ * multi_buffer.h). No changes are made to the SHA1_HASH_CTX_MGR in the case of an
+ * error; no processing is done for other hashes.
+ *
+ */
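+
+/*
+ * Illustrative sketch of the flow described above (not part of the upstream
+ * header): hashing several whole buffers through the multibinary interface.
+ * NUM_JOBS and the arrays "bufs"/"lens" are assumptions for the example.
+ *
+ *   enum { NUM_JOBS = 8 };
+ *   SHA1_HASH_CTX_MGR mgr;
+ *   SHA1_HASH_CTX ctxpool[NUM_JOBS], *ctx;
+ *
+ *   sha1_ctx_mgr_init(&mgr);
+ *   for (int i = 0; i < NUM_JOBS; i++) {
+ *           hash_ctx_init(&ctxpool[i]);                  // macro from multi_buffer.h
+ *           ctxpool[i].user_data = (void *)(uintptr_t)i; // optional job tag
+ *           ctx = sha1_ctx_mgr_submit(&mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ *           if (ctx != NULL) {
+ *                   // a returned context has completed, or was rejected with a
+ *                   // non-zero ctx->error (codes defined in multi_buffer.h)
+ *           }
+ *   }
+ *   while ((ctx = sha1_ctx_mgr_flush(&mgr)) != NULL) {
+ *           // digest for this job is in ctx->job.result_digest[0..SHA1_DIGEST_NWORDS-1]
+ *   }
+ */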
+
+#include <stdint.h>
+#include "multi_buffer.h"
+#include "types.h"
+
+#ifndef _MSC_VER
+#include <stdbool.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Hash Constants and Typedefs
+#define SHA1_DIGEST_NWORDS 5
+#define SHA1_MAX_LANES 16
+#define SHA1_X8_LANES 8
+#define SHA1_MIN_LANES 4
+#define SHA1_BLOCK_SIZE 64
+#define SHA1_LOG2_BLOCK_SIZE 6
+#define SHA1_PADLENGTHFIELD_SIZE 8
+#define SHA1_INITIAL_DIGEST \
+ 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0
+
+typedef uint32_t sha1_digest_array[SHA1_DIGEST_NWORDS][SHA1_MAX_LANES];
+typedef uint32_t SHA1_WORD_T;
+
+/** @brief Scheduler layer - Holds info describing a single SHA1 job for the multi-buffer manager */
+
+typedef struct {
+ uint8_t* buffer; //!< pointer to data buffer for this job
+ uint32_t len; //!< length of buffer for this job in blocks.
+ DECLARE_ALIGNED(uint32_t result_digest[SHA1_DIGEST_NWORDS],64);
+ JOB_STS status; //!< output job status
+ void* user_data; //!< pointer for user's job-related data
+} SHA1_JOB;
+
+/** @brief Scheduler layer - Holds arguments for submitted SHA1 job */
+
+typedef struct {
+ sha1_digest_array digest;
+ uint8_t* data_ptr[SHA1_MAX_LANES];
+} SHA1_MB_ARGS_X16;
+
+/** @brief Scheduler layer - Lane data */
+
+typedef struct {
+ SHA1_JOB *job_in_lane;
+} SHA1_LANE_DATA;
+
+/** @brief Scheduler layer - Holds state for multi-buffer SHA1 jobs */
+
+typedef struct {
+ SHA1_MB_ARGS_X16 args;
+ uint32_t lens[SHA1_MAX_LANES];
+ uint64_t unused_lanes; //!< each nibble is index (0...3 or 0...7 or 0...15) of unused lanes, nibble 4 or 8 is set to F as a flag
+ SHA1_LANE_DATA ldata[SHA1_MAX_LANES];
+ uint32_t num_lanes_inuse;
+} SHA1_MB_JOB_MGR;
+
+/** @brief Context layer - Holds state for multi-buffer SHA1 jobs */
+
+typedef struct {
+ SHA1_MB_JOB_MGR mgr;
+} SHA1_HASH_CTX_MGR;
+
+/** @brief Context layer - Holds info describing a single SHA1 job for the multi-buffer CTX manager */
+
+typedef struct {
+ SHA1_JOB job; // Must be at struct offset 0.
+ HASH_CTX_STS status; //!< Context status flag
+ HASH_CTX_ERROR error; //!< Context error flag
+ uint64_t total_length; //!< Running counter of length processed for this CTX's job
+ const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job
+ uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes.
+ uint8_t partial_block_buffer[SHA1_BLOCK_SIZE * 2]; //!< CTX partial blocks
+ uint32_t partial_block_buffer_length;
+ void* user_data; //!< pointer for user to keep any job-related data
+} SHA1_HASH_CTX;
+
+/******************** multibinary function prototypes **********************/
+
+/**
+ * @brief Initialize the SHA1 multi-buffer manager structure.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the multi-buffer manager.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush (SHA1_HASH_CTX_MGR* mgr);
+
+
+/*******************************************************************
+ * Context level API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the context level SHA1 multi-buffer manager structure.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_sse (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the context level multi-buffer manager.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_sse (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_sse (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the context level SHA1 multi-buffer manager structure.
+ * @requires SSE4.1 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_sse_ni (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the context level multi-buffer manager.
+ * @requires SSE4.1 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_sse_ni (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires SSE4.1 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_sse_ni (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA1 multi-buffer manager structure.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_avx (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the multi-buffer manager.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_avx (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_avx (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA1 multi-buffer manager structure.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_avx2 (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the multi-buffer manager.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_avx2 (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_avx2 (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA1 multi-buffer manager structure.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_avx512 (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the multi-buffer manager.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_avx512 (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_avx512 (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA1 multi-buffer manager structure.
+ * @requires AVX512 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_avx512_ni (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the multi-buffer manager.
+ * @requires AVX512 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_avx512_ni (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires AVX512 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_avx512_ni (SHA1_HASH_CTX_MGR* mgr);
+
+
+/*******************************************************************
+ * Scheduler (internal) level out-of-order function prototypes
+ ******************************************************************/
+
+void sha1_mb_mgr_init_sse (SHA1_MB_JOB_MGR *state);
+SHA1_JOB* sha1_mb_mgr_submit_sse (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_sse (SHA1_MB_JOB_MGR *state);
+
+#define sha1_mb_mgr_init_avx sha1_mb_mgr_init_sse
+SHA1_JOB* sha1_mb_mgr_submit_avx (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_avx (SHA1_MB_JOB_MGR *state);
+
+void sha1_mb_mgr_init_avx2 (SHA1_MB_JOB_MGR *state);
+SHA1_JOB* sha1_mb_mgr_submit_avx2 (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_avx2 (SHA1_MB_JOB_MGR *state);
+
+void sha1_mb_mgr_init_avx512 (SHA1_MB_JOB_MGR *state);
+SHA1_JOB* sha1_mb_mgr_submit_avx512 (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_avx512 (SHA1_MB_JOB_MGR *state);
+
+void sha1_mb_mgr_init_sse_ni (SHA1_MB_JOB_MGR *state);
+SHA1_JOB* sha1_mb_mgr_submit_sse_ni (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_sse_ni (SHA1_MB_JOB_MGR *state);
+
+void sha1_mb_mgr_init_avx512_ni (SHA1_MB_JOB_MGR *state);
+SHA1_JOB* sha1_mb_mgr_submit_avx512_ni (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_avx512_ni (SHA1_MB_JOB_MGR *state);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _SHA1_MB_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h b/src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h
new file mode 100644
index 000000000..8ef186b2d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h
@@ -0,0 +1,451 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _SHA256_MB_H_
+#define _SHA256_MB_H_
+
+/**
+ * @file sha256_mb.h
+ * @brief Multi-buffer CTX API SHA256 function prototypes and structures
+ *
+ * Interface for multi-buffer SHA256 functions
+ *
+ * <b> Multi-buffer SHA256 Entire or First-Update..Update-Last </b>
+ *
+ * The interface to this multi-buffer hashing code is carried out through the
+ * context-level (CTX) init, submit and flush functions and the SHA256_HASH_CTX_MGR and
+ * SHA256_HASH_CTX objects. Numerous SHA256_HASH_CTX objects may be instantiated by the
+ * application for use with a single SHA256_HASH_CTX_MGR.
+ *
+ * The CTX interface functions carry out the initialization and padding of the jobs
+ * entered by the user and add them to the multi-buffer manager. The lower level "scheduler"
+ * layer then processes the jobs in an out-of-order manner. The scheduler layer functions
+ * are internal and are not intended to be invoked directly. Jobs can be submitted
+ * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial
+ * jobs which can be started using the HASH_FIRST flag, and later resumed or finished
+ * using the HASH_UPDATE and HASH_LAST flags respectively.
+ *
+ * <b>Note:</b> The submit function does not require data buffers to be block sized.
+ *
+ * The SHA256 CTX interface functions are available for 4 architectures: SSE, AVX, AVX2 and
+ * AVX512. In addition, a multibinary interface is provided, which selects the appropriate
+ * architecture-specific function at runtime.
+ *
+ * <b>Usage:</b> The application creates a SHA256_HASH_CTX_MGR object and initializes it
+ * with a call to the sha256_ctx_mgr_init*() function, where henceforth "*" stands for the
+ * relevant suffix for each architecture: _sse, _avx, _avx2, _avx512 (or no suffix for the
+ * multibinary version). The SHA256_HASH_CTX_MGR object will be used to schedule processor
+ * resources, with up to 4 SHA256_HASH_CTX objects (or 8 in the AVX2 case, 16 in the AVX512)
+ * being processed at a time.
+ *
+ * Each SHA256_HASH_CTX must be initialized before first use by the hash_ctx_init macro
+ * defined in multi_buffer.h. After initialization, the application may begin computing
+ * a hash by giving the SHA256_HASH_CTX to a SHA256_HASH_CTX_MGR using the submit functions
+ * sha256_ctx_mgr_submit*() with the HASH_FIRST flag set. When the SHA256_HASH_CTX is
+ * returned to the application (via this or a later call to sha256_ctx_mgr_submit*() or
+ * sha256_ctx_mgr_flush*()), the application can then re-submit it with another call to
+ * sha256_ctx_mgr_submit*(), but without the HASH_FIRST flag set.
+ *
+ * Ideally, on the last buffer for that hash, sha256_ctx_mgr_submit*() is called with
+ * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero
+ * length if necessary. When a SHA256_HASH_CTX is returned after having been submitted with
+ * HASH_LAST, it will contain a valid hash. The SHA256_HASH_CTX can be reused immediately
+ * by submitting with HASH_FIRST.
+ *
+ * For example, you would submit hashes with the following flags for the following numbers
+ * of buffers:
+ * <ul>
+ * <li> one buffer: HASH_FIRST | HASH_LAST (or, equivalently, HASH_ENTIRE)
+ * <li> two buffers: HASH_FIRST, HASH_LAST
+ * <li> three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * <li> etc.
+ * </ul>
+ *
+ * The order in which SHA256_CTX objects are returned is in general different from the order
+ * in which they are submitted.
+ *
+ * A few possible error conditions exist:
+ * <ul>
+ * <li> Submitting flags other than the allowed entire/first/update/last values
+ * <li> Submitting a context that is currently being managed by a SHA256_HASH_CTX_MGR.
+ * <li> Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ * </ul>
+ *
+ * These error conditions are reported by returning the SHA256_HASH_CTX immediately after
+ * a submit with its error member set to a non-zero error code (defined in
+ * multi_buffer.h). No changes are made to the SHA256_HASH_CTX_MGR in the case of an
+ * error; no processing is done for other hashes.
+ *
+ */
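+
+/*
+ * Illustrative sketch (not part of the upstream header) of the
+ * First-Update..Last flow described above for a single stream supplied in
+ * three pieces; "buf1"/"buf2"/"buf3" and their lengths are assumptions for
+ * the example, and submit return values are ignored for brevity.
+ *
+ *   SHA256_HASH_CTX_MGR mgr;
+ *   SHA256_HASH_CTX c;
+ *
+ *   sha256_ctx_mgr_init(&mgr);
+ *   hash_ctx_init(&c);                                   // macro from multi_buffer.h
+ *   sha256_ctx_mgr_submit(&mgr, &c, buf1, len1, HASH_FIRST);
+ *   while (sha256_ctx_mgr_flush(&mgr) != NULL)
+ *           ;          // drain so that &c is no longer managed before re-submitting
+ *   sha256_ctx_mgr_submit(&mgr, &c, buf2, len2, HASH_UPDATE);
+ *   while (sha256_ctx_mgr_flush(&mgr) != NULL)
+ *           ;
+ *   sha256_ctx_mgr_submit(&mgr, &c, buf3, len3, HASH_LAST);
+ *   while (sha256_ctx_mgr_flush(&mgr) != NULL)
+ *           ;
+ *   // digest is now in c.job.result_digest; c.status and c.error report the outcome
+ */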
+
+#include <stdint.h>
+#include "multi_buffer.h"
+#include "types.h"
+
+#ifndef _MSC_VER
+#include <stdbool.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Hash Constants and Typedefs
+#define SHA256_DIGEST_NWORDS 8
+#define SHA256_MAX_LANES 16
+#define SHA256_X8_LANES 8
+#define SHA256_MIN_LANES 4
+#define SHA256_BLOCK_SIZE 64
+#define SHA256_LOG2_BLOCK_SIZE 6
+#define SHA256_PADLENGTHFIELD_SIZE 8
+#define SHA256_INITIAL_DIGEST \
+ 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, \
+ 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+
+typedef uint32_t sha256_digest_array[SHA256_DIGEST_NWORDS][SHA256_MAX_LANES];
+typedef uint32_t SHA256_WORD_T;
+
+/** @brief Scheduler layer - Holds info describing a single SHA256 job for the multi-buffer manager */
+
+typedef struct {
+ uint8_t* buffer; //!< pointer to data buffer for this job
+ uint64_t len; //!< length of buffer for this job in blocks.
+ DECLARE_ALIGNED(uint32_t result_digest[SHA256_DIGEST_NWORDS], 64);
+ JOB_STS status; //!< output job status
+ void* user_data; //!< pointer for user's job-related data
+} SHA256_JOB;
+
+/** @brief Scheduler layer - Holds arguments for submitted SHA256 job */
+
+typedef struct {
+ sha256_digest_array digest;
+ uint8_t* data_ptr[SHA256_MAX_LANES];
+} SHA256_MB_ARGS_X16;
+
+/** @brief Scheduler layer - Lane data */
+
+typedef struct {
+ SHA256_JOB *job_in_lane;
+} SHA256_LANE_DATA;
+
+/** @brief Scheduler layer - Holds state for multi-buffer SHA256 jobs */
+
+typedef struct {
+ SHA256_MB_ARGS_X16 args;
+ uint32_t lens[SHA256_MAX_LANES];
+ uint64_t unused_lanes; //!< each nibble is index (0...3 or 0...7) of unused lanes, nibble 4 or 8 is set to F as a flag
+ SHA256_LANE_DATA ldata[SHA256_MAX_LANES];
+ uint32_t num_lanes_inuse;
+} SHA256_MB_JOB_MGR;
+
+/** @brief Context layer - Holds state for multi-buffer SHA256 jobs */
+
+typedef struct {
+ SHA256_MB_JOB_MGR mgr;
+} SHA256_HASH_CTX_MGR;
+
+/** @brief Context layer - Holds info describing a single SHA256 job for the multi-buffer CTX manager */
+
+typedef struct {
+ SHA256_JOB job; // Must be at struct offset 0.
+ HASH_CTX_STS status; //!< Context status flag
+ HASH_CTX_ERROR error; //!< Context error flag
+ uint64_t total_length; //!< Running counter of length processed for this CTX's job
+ const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job
+ uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes.
+ uint8_t partial_block_buffer[SHA256_BLOCK_SIZE * 2]; //!< CTX partial blocks
+ uint32_t partial_block_buffer_length;
+ void* user_data; //!< pointer for user to keep any job-related data
+} SHA256_HASH_CTX;
+
+/******************** multibinary function prototypes **********************/
+
+/**
+ * @brief Initialize the SHA256 multi-buffer manager structure.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the multi-buffer manager.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush (SHA256_HASH_CTX_MGR* mgr);
+
+
+/*******************************************************************
+ * CTX level API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the context level SHA256 multi-buffer manager structure.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_sse (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the context level multi-buffer manager.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_sse (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_sse (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the context level SHA256 multi-buffer manager structure.
+ * @requires SSE4.1 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_sse_ni (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the context level multi-buffer manager.
+ * @requires SSE4.1 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_sse_ni (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires SSE4.1 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_sse_ni (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA256 multi-buffer manager structure.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_avx (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the multi-buffer manager.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_avx (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_avx (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA256 multi-buffer manager structure.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_avx2 (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the multi-buffer manager.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_avx2 (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_avx2 (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA256 multi-buffer manager structure.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_avx512 (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the multi-buffer manager.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_avx512 (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_avx512 (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA256 multi-buffer manager structure.
+ * @requires AVX512 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_avx512_ni (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the multi-buffer manager.
+ * @requires AVX512 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_avx512_ni (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires AVX512 and SHANI
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_avx512_ni (SHA256_HASH_CTX_MGR* mgr);
+
+
+/*******************************************************************
+ * Scheduler (internal) level out-of-order function prototypes
+ ******************************************************************/
+
+void sha256_mb_mgr_init_sse (SHA256_MB_JOB_MGR *state);
+SHA256_JOB* sha256_mb_mgr_submit_sse (SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_sse (SHA256_MB_JOB_MGR *state);
+
+#define sha256_mb_mgr_init_avx sha256_mb_mgr_init_sse
+SHA256_JOB* sha256_mb_mgr_submit_avx (SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_avx (SHA256_MB_JOB_MGR *state);
+
+void sha256_mb_mgr_init_avx2 (SHA256_MB_JOB_MGR *state);
+SHA256_JOB* sha256_mb_mgr_submit_avx2 (SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_avx2 (SHA256_MB_JOB_MGR *state);
+
+void sha256_mb_mgr_init_avx512 (SHA256_MB_JOB_MGR *state);
+SHA256_JOB* sha256_mb_mgr_submit_avx512 (SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_avx512 (SHA256_MB_JOB_MGR *state);
+
+void sha256_mb_mgr_init_sse_ni (SHA256_MB_JOB_MGR *state);
+SHA256_JOB* sha256_mb_mgr_submit_sse_ni (SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_sse_ni (SHA256_MB_JOB_MGR *state);
+
+void sha256_mb_mgr_init_avx512_ni (SHA256_MB_JOB_MGR *state);
+SHA256_JOB* sha256_mb_mgr_submit_avx512_ni (SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_avx512_ni (SHA256_MB_JOB_MGR *state);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _SHA256_MB_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h b/src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h
new file mode 100644
index 000000000..ce3950ad1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h
@@ -0,0 +1,422 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _SHA512_MB_H_
+#define _SHA512_MB_H_
+
+/**
+ * @file sha512_mb.h
+ * @brief Single/Multi-buffer CTX API SHA512 function prototypes and structures
+ *
+ * Interface for single and multi-buffer SHA512 functions
+ *
+ * <b> Single/Multi-buffer SHA512 Entire or First-Update..Update-Last </b>
+ *
+ * The interface to this single/multi-buffer hashing code is carried out through the
+ * context-level (CTX) init, submit and flush functions and the SHA512_HASH_CTX_MGR and
+ * SHA512_HASH_CTX objects. Numerous SHA512_HASH_CTX objects may be instantiated by the
+ * application for use with a single SHA512_HASH_CTX_MGR.
+ *
+ * The CTX interface functions carry out the initialization and padding of the jobs
+ * entered by the user and add them to the multi-buffer manager. The lower level "scheduler"
+ * layer then processes the jobs in an out-of-order manner. The scheduler layer functions
+ * are internal and are not intended to be invoked directly. Jobs can be submitted
+ * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial
+ * jobs which can be started using the HASH_FIRST flag, and later resumed or finished
+ * using the HASH_UPDATE and HASH_LAST flags respectively.
+ *
+ * <b>Note:</b> The submit function does not require data buffers to be block sized.
+ *
+ * The SHA512 CTX interface functions are available for 5 architectures: multi-buffer SSE,
+ * AVX, AVX2, AVX512 and single-buffer SSE4 (which is used in the same way as the
+ * multi-buffer code). In addition, a multibinary interface is provided, which selects the
+ * appropriate architecture-specific function at runtime. This multibinary interface
+ * selects the single buffer SSE4 functions when the platform is detected to be Silvermont.
+ *
+ * <b>Usage:</b> The application creates a SHA512_HASH_CTX_MGR object and initializes it
+ * with a call to the sha512_ctx_mgr_init*() function, where henceforth "*" stands for the
+ * relevant suffix for each architecture: _sse, _avx, _avx2, _avx512 (or no suffix for the
+ * multibinary version). The SHA512_HASH_CTX_MGR object will be used to schedule processor
+ * resources, with up to 2 SHA512_HASH_CTX objects (or 4 in the AVX2 case, 8 in the AVX512
+ * case) being processed at a time.
+ *
+ * Each SHA512_HASH_CTX must be initialized before first use by the hash_ctx_init macro
+ * defined in multi_buffer.h. After initialization, the application may begin computing
+ * a hash by giving the SHA512_HASH_CTX to a SHA512_HASH_CTX_MGR using the submit functions
+ * sha512_ctx_mgr_submit*() with the HASH_FIRST flag set. When the SHA512_HASH_CTX is
+ * returned to the application (via this or a later call to sha512_ctx_mgr_submit*() or
+ * sha512_ctx_mgr_flush*()), the application can then re-submit it with another call to
+ * sha512_ctx_mgr_submit*(), but without the HASH_FIRST flag set.
+ *
+ * Ideally, on the last buffer for that hash, sha512_ctx_mgr_submit*() is called with
+ * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero
+ * length if necessary. When a SHA512_HASH_CTX is returned after having been submitted with
+ * HASH_LAST, it will contain a valid hash. The SHA512_HASH_CTX can be reused immediately
+ * by submitting with HASH_FIRST.
+ *
+ * For example, you would submit hashes with the following flags for the following numbers
+ * of buffers:
+ * <ul>
+ * <li> one buffer: HASH_FIRST | HASH_LAST (or, equivalently, HASH_ENTIRE)
+ * <li> two buffers: HASH_FIRST, HASH_LAST
+ * <li> three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * <li> etc.
+ * </ul>
+ *
+ * The order in which SHA512_CTX objects are returned is in general different from the order
+ * in which they are submitted.
+ *
+ * A few possible error conditions exist:
+ * <ul>
+ * <li> Submitting flags other than the allowed entire/first/update/last values
+ * <li> Submitting a context that is currently being managed by a SHA512_HASH_CTX_MGR. (Note:
+ * This error case is not applicable to the single buffer SSE4 version)
+ * <li> Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ * </ul>
+ *
+ * These error conditions are reported by returning the SHA512_HASH_CTX immediately after
+ * a submit with its error member set to a non-zero error code (defined in
+ * multi_buffer.h). No changes are made to the SHA512_HASH_CTX_MGR in the case of an
+ * error; no processing is done for other hashes.
+ *
+ */
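+
+/*
+ * Illustrative sketch (not part of the upstream header) of the error reporting
+ * described above: a context returned from submit or flush with a non-zero
+ * error member was rejected, and the manager state is unchanged.  The names
+ * "buf" and "len" are assumptions for the example.
+ *
+ *   SHA512_HASH_CTX_MGR mgr;
+ *   SHA512_HASH_CTX c, *ret;
+ *
+ *   sha512_ctx_mgr_init(&mgr);
+ *   hash_ctx_init(&c);                                   // macro from multi_buffer.h
+ *   ret = sha512_ctx_mgr_submit(&mgr, &c, buf, len, HASH_ENTIRE);
+ *   if (ret != NULL && ret->error) {
+ *           // rejected submit (error codes defined in multi_buffer.h); the job
+ *           // may be corrected and submitted again
+ *   }
+ *   while ((ret = sha512_ctx_mgr_flush(&mgr)) != NULL) {
+ *           // ret->job.result_digest holds the completed SHA512 digest
+ *   }
+ */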
+
+#include <stdint.h>
+#include "multi_buffer.h"
+#include "types.h"
+
+#ifndef _MSC_VER
+#include <stdbool.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Hash Constants and Typedefs
+#define SHA512_DIGEST_NWORDS 8
+#define SHA512_MAX_LANES 8
+#define SHA512_X4_LANES 4
+#define SHA512_MIN_LANES 2
+#define SHA512_BLOCK_SIZE 128
+#define SHA512_LOG2_BLOCK_SIZE 7
+#define SHA512_PADLENGTHFIELD_SIZE 16
+#define SHA512_INITIAL_DIGEST \
+ 0x6a09e667f3bcc908,0xbb67ae8584caa73b,0x3c6ef372fe94f82b,0xa54ff53a5f1d36f1, \
+ 0x510e527fade682d1,0x9b05688c2b3e6c1f,0x1f83d9abfb41bd6b,0x5be0cd19137e2179
+
+
+typedef uint64_t sha512_digest_array[SHA512_DIGEST_NWORDS][SHA512_MAX_LANES];
+typedef uint64_t SHA512_WORD_T;
+
+/** @brief Scheduler layer - Holds info describing a single SHA512 job for the multi-buffer manager */
+
+typedef struct {
+ uint8_t* buffer; //!< pointer to data buffer for this job
+ uint64_t len; //!< length of buffer for this job in blocks.
+ DECLARE_ALIGNED(uint64_t result_digest[SHA512_DIGEST_NWORDS], 64);
+ JOB_STS status; //!< output job status
+ void* user_data; //!< pointer for user's job-related data
+} SHA512_JOB;
+
+/** @brief Scheduler layer - Holds arguments for submitted SHA512 job */
+
+typedef struct {
+ sha512_digest_array digest;
+ uint8_t* data_ptr[SHA512_MAX_LANES];
+} SHA512_MB_ARGS_X8;
+
+/** @brief Scheduler layer - Lane data */
+
+typedef struct {
+ SHA512_JOB *job_in_lane;
+} SHA512_LANE_DATA;
+
+/** @brief Scheduler layer - Holds state for multi-buffer SHA512 jobs */
+
+typedef struct {
+ SHA512_MB_ARGS_X8 args;
+ uint64_t lens[SHA512_MAX_LANES];
+ uint64_t unused_lanes; //!< each byte is index (00, 01 or 00...03) of unused lanes, byte 2 or 4 is set to FF as a flag
+ SHA512_LANE_DATA ldata[SHA512_MAX_LANES];
+ uint32_t num_lanes_inuse;
+} SHA512_MB_JOB_MGR;
+
+/** @brief Context layer - Holds state for multi-buffer SHA512 jobs */
+
+typedef struct {
+ SHA512_MB_JOB_MGR mgr;
+} SHA512_HASH_CTX_MGR;
+
+/** @brief Context layer - Holds info describing a single SHA512 job for the multi-buffer CTX manager */
+
+typedef struct {
+ SHA512_JOB job; // Must be at struct offset 0.
+ HASH_CTX_STS status; //!< Context status flag
+ HASH_CTX_ERROR error; //!< Context error flag
+ uint64_t total_length; //!< Running counter of length processed for this CTX's job
+ const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job
+ uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes.
+ uint8_t partial_block_buffer[SHA512_BLOCK_SIZE * 2]; //!< CTX partial blocks
+ uint32_t partial_block_buffer_length;
+ void* user_data; //!< pointer for user to keep any job-related data
+} SHA512_HASH_CTX;
+
+/*******************************************************************
+ * Context level API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the context level SHA512 multi-buffer manager structure.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init_sse (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the context level multi-buffer manager.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit_sse (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush_sse (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA512 multi-buffer manager structure.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init_avx (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the multi-buffer manager.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit_avx (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush_avx (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA512 multi-buffer manager structure.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init_avx2 (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the multi-buffer manager.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit_avx2 (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush_avx2 (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA512 multi-buffer manager structure.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init_avx512 (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the multi-buffer manager.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit_avx512 (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush_avx512 (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA512 multi-buffer manager structure.
+ * @requires SSE4
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init_sb_sse4 (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the multi-buffer manager.
+ * @requires SSE4
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit_sb_sse4 (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires SSE4
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush_sb_sse4 (SHA512_HASH_CTX_MGR* mgr);
+
+/******************** multibinary function prototypes **********************/
+
+/**
+ * @brief Initialize the SHA512 multi-buffer manager structure.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the multi-buffer manager.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush (SHA512_HASH_CTX_MGR* mgr);
+
+/*******************************************************************
+ * Scheduler (internal) level out-of-order function prototypes
+ ******************************************************************/
+
+void sha512_mb_mgr_init_sse (SHA512_MB_JOB_MGR *state);
+SHA512_JOB* sha512_mb_mgr_submit_sse (SHA512_MB_JOB_MGR *state, SHA512_JOB* job);
+SHA512_JOB* sha512_mb_mgr_flush_sse (SHA512_MB_JOB_MGR *state);
+
+#define sha512_mb_mgr_init_avx sha512_mb_mgr_init_sse
+SHA512_JOB* sha512_mb_mgr_submit_avx (SHA512_MB_JOB_MGR *state, SHA512_JOB* job);
+SHA512_JOB* sha512_mb_mgr_flush_avx (SHA512_MB_JOB_MGR *state);
+
+void sha512_mb_mgr_init_avx2 (SHA512_MB_JOB_MGR *state);
+SHA512_JOB* sha512_mb_mgr_submit_avx2 (SHA512_MB_JOB_MGR *state, SHA512_JOB* job);
+SHA512_JOB* sha512_mb_mgr_flush_avx2 (SHA512_MB_JOB_MGR *state);
+
+void sha512_mb_mgr_init_avx512 (SHA512_MB_JOB_MGR *state);
+SHA512_JOB* sha512_mb_mgr_submit_avx512 (SHA512_MB_JOB_MGR *state, SHA512_JOB* job);
+SHA512_JOB* sha512_mb_mgr_flush_avx512 (SHA512_MB_JOB_MGR *state);
+
+// Single-buffer SHA512 APIs, optimized for Silvermont (SLM).
+void sha512_sse4 (const void* M, void* D, uint64_t L);
+// Note that these APIs follow the same high-level usage as the multi-buffer APIs.
+void sha512_sb_mgr_init_sse4 (SHA512_MB_JOB_MGR *state);
+SHA512_JOB* sha512_sb_mgr_submit_sse4 (SHA512_MB_JOB_MGR *state, SHA512_JOB* job);
+SHA512_JOB* sha512_sb_mgr_flush_sse4 (SHA512_MB_JOB_MGR *state);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _SHA512_MB_H_
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/include/sm3_mb.h b/src/crypto/isa-l/isa-l_crypto/include/sm3_mb.h
new file mode 100644
index 000000000..d9e7b4eed
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/sm3_mb.h
@@ -0,0 +1,155 @@
+/**********************************************************************
+ Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _SM3_MB_H_
+#define _SM3_MB_H_
+
+
+/**
+ * @file sm3_mb.h
+ * @brief Multi-buffer CTX API SM3 function prototypes and structures
+ *
+ * \warning Experimental interface with only base functions available at this
+ * time.
+ */
+
+#include <stdint.h>
+#include "multi_buffer.h"
+#include "types.h"
+
+#ifndef _MSC_VER
+#include <stdbool.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SM3_DIGEST_NWORDS 8 /* Word in SM3 is 32-bit */
+#define SM3_MAX_LANES 16
+#define SM3_X8_LANES 8
+#define SM3_BLOCK_SIZE 64
+#define SM3_LOG2_BLOCK_SIZE 6
+#define SM3_PADLENGTHFIELD_SIZE 8
+#define SM3_INITIAL_DIGEST \
+ 0x7380166f, 0x4914b2b9, 0x172442d7, 0xda8a0600, \
+ 0xa96f30bc, 0x163138aa, 0xe38dee4d, 0xb0fb0e4e
+
+typedef uint32_t sm3_digest_array[SM3_DIGEST_NWORDS][SM3_MAX_LANES];
+typedef uint32_t SM3_WORD_T;
+
+/** @brief Scheduler layer - Holds info describing a single SM3 job for the multi-buffer manager */
+
+typedef struct {
+ uint8_t *buffer; //!< pointer to data buffer for this job
+ uint64_t len; //!< length of buffer for this job in blocks.
+ DECLARE_ALIGNED(uint32_t result_digest[SM3_DIGEST_NWORDS], 64);
+ JOB_STS status; //!< output job status
+ void *user_data; //!< pointer for user's job-related data
+} SM3_JOB;
+
+/** @brief Scheduler layer - Holds arguments for submitted SM3 job */
+
+typedef struct {
+ sm3_digest_array digest;
+ uint8_t *data_ptr[SM3_MAX_LANES];
+} SM3_MB_ARGS_X16;
+
+/** @brief Scheduler layer - Lane data */
+
+typedef struct {
+ SM3_JOB *job_in_lane;
+} SM3_LANE_DATA;
+
+/** @brief Scheduler layer - Holds state for multi-buffer SM3 jobs */
+
+typedef struct {
+ SM3_MB_ARGS_X16 args;
+ uint32_t lens[SM3_MAX_LANES];
+	uint64_t unused_lanes; //!< each nibble is the index (0...3 or 0...7) of an unused lane; nibble 4 or 8 is set to F as a flag
+ SM3_LANE_DATA ldata[SM3_MAX_LANES];
+ uint32_t num_lanes_inuse;
+} SM3_MB_JOB_MGR;
+
+/** @brief Context layer - Holds state for multi-buffer SM3 jobs */
+
+typedef struct {
+ SM3_MB_JOB_MGR mgr;
+} SM3_HASH_CTX_MGR;
+
+/** @brief Context layer - Holds info describing a single SM3 job for the multi-buffer CTX manager */
+
+typedef struct {
+ SM3_JOB job; // Must be at struct offset 0.
+ HASH_CTX_STS status; //!< Context status flag
+ HASH_CTX_ERROR error; //!< Context error flag
+ uint64_t total_length; //!< Running counter of length processed for this CTX's job
+ const void *incoming_buffer; //!< pointer to data input buffer for this CTX's job
+ uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes.
+ uint8_t partial_block_buffer[SM3_BLOCK_SIZE * 2]; //!< CTX partial blocks
+ uint32_t partial_block_buffer_length;
+ void *user_data; //!< pointer for user to keep any job-related data
+} SM3_HASH_CTX;
+
+/******************** multibinary function prototypes **********************/
+
+/**
+* @brief Initialize the SM3 multi-buffer manager structure.
+*
+* @param mgr Structure holding context level state info
+* @returns void
+*/
+void sm3_ctx_mgr_init(SM3_HASH_CTX_MGR * mgr);
+
+/**
+* @brief Submit a new SM3 job to the multi-buffer manager.
+*
+* @param mgr Structure holding context level state info
+* @param ctx Structure holding ctx job info
+* @param buffer Pointer to buffer to be processed
+* @param len Length of buffer (in bytes) to be processed
+* @param flags Input flag specifying job type (first, update, last or entire)
+* @returns NULL if no jobs complete, or a pointer to a completed job's context structure.
+*/
+SM3_HASH_CTX *sm3_ctx_mgr_submit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags);
+
+/**
+* @brief Finish all submitted SM3 jobs and return when complete.
+*
+* @param mgr Structure holding context level state info
+* @returns NULL if there are no jobs to complete, or a pointer to a completed job's context structure.
+*/
+SM3_HASH_CTX *sm3_ctx_mgr_flush(SM3_HASH_CTX_MGR * mgr);
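+
+/*
+ * Illustrative usage sketch (not part of the upstream header). The typical
+ * multi-buffer flow is: init the manager, submit one job per buffer, then
+ * flush until NULL. Assumes hash_ctx_init() from multi_buffer.h; buf/buf_len
+ * stand for the application's data, and error checks are omitted.
+ *
+ *   SM3_HASH_CTX_MGR mgr;
+ *   SM3_HASH_CTX ctx;
+ *
+ *   sm3_ctx_mgr_init(&mgr);
+ *   hash_ctx_init(&ctx);
+ *   sm3_ctx_mgr_submit(&mgr, &ctx, buf, buf_len, HASH_ENTIRE);
+ *   while (sm3_ctx_mgr_flush(&mgr) != NULL)
+ *           ;                       // drain all in-flight jobs
+ *   // digest now in ctx.job.result_digest[0 .. SM3_DIGEST_NWORDS-1]
+ */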
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/include/test.h b/src/crypto/isa-l/isa-l_crypto/include/test.h
new file mode 100644
index 000000000..7b99390b8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/test.h
@@ -0,0 +1,111 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _TEST_H
+#define _TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "endian_helper.h"
+
+// Use sys/time.h functions for time
+#if defined (__unix__) || (__APPLE__) || (__MINGW32__)
+# include <sys/time.h>
+#endif
+
+#ifdef _MSC_VER
+# define inline __inline
+# include <time.h>
+# include <Windows.h>
+#endif
+
+#include <stdio.h>
+#include <stdint.h>
+
+struct perf{
+ struct timeval tv;
+};
+
+
+#if defined (__unix__) || (__APPLE__) || (__MINGW32__)
+static inline int perf_start(struct perf *p)
+{
+ return gettimeofday(&(p->tv), 0);
+}
+static inline int perf_stop(struct perf *p)
+{
+ return gettimeofday(&(p->tv), 0);
+}
+
+static inline void perf_print(struct perf stop, struct perf start, long long dsize)
+{
+ long long secs = stop.tv.tv_sec - start.tv.tv_sec;
+ long long usecs = secs * 1000000 + stop.tv.tv_usec - start.tv.tv_usec;
+
+ printf("runtime = %10lld usecs", usecs);
+ if (dsize != 0) {
+#if 1 // single printf; the #else branch splits it to work around 32-bit printf issues
+ printf(", bandwidth %lld MB in %.4f sec = %.2f MB/s\n", dsize/(1024*1024),
+ ((double) usecs)/1000000, ((double) dsize) / (double)usecs);
+#else
+ printf(", bandwidth %lld MB ", dsize/(1024*1024));
+ printf("in %.4f sec ",(double)usecs/1000000);
+ printf("= %.2f MB/s\n", (double)dsize/usecs);
+#endif
+ }
+ else
+ printf("\n");
+}
+#endif
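+
+/*
+ * Illustrative timing sketch (not part of the upstream header); run_workload()
+ * is a hypothetical function and dsize is the number of bytes it processed:
+ *
+ *   struct perf start, stop;
+ *   perf_start(&start);
+ *   run_workload();
+ *   perf_stop(&stop);
+ *   perf_print(stop, start, dsize);
+ */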
+
+static inline uint64_t get_filesize(FILE *fp)
+{
+ uint64_t file_size;
+ fpos_t pos, pos_curr;
+
+ fgetpos(fp, &pos_curr); /* Save current position */
+#if defined(_WIN32) || defined(_WIN64)
+ _fseeki64(fp, 0, SEEK_END);
+#else
+ fseeko(fp, 0, SEEK_END);
+#endif
+ fgetpos(fp, &pos);
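+	/* Assumes fpos_t stores a plain byte offset in its first 8 bytes
+	   (true for common glibc and MSVC targets). */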
+ file_size = *(uint64_t *)&pos;
+ fsetpos(fp, &pos_curr); /* Restore position */
+
+ return file_size;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _TEST_H
diff --git a/src/crypto/isa-l/isa-l_crypto/include/types.h b/src/crypto/isa-l/isa-l_crypto/include/types.h
new file mode 100644
index 000000000..de452557a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/types.h
@@ -0,0 +1,100 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+/**
+ * @file types.h
+ * @brief Defines common alignment and debug macros
+ *
+ */
+
+#ifndef __TYPES_H
+#define __TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#if defined __unix__ || defined __APPLE__
+# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
+# define __forceinline static inline
+# define aligned_free(x) free(x)
+#else
+# ifdef __MINGW32__
+# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
+# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
+# define aligned_free(x) _aligned_free(x)
+# else
+# define DECLARE_ALIGNED(decl, alignval) __declspec(align(alignval)) decl
+# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
+# define aligned_free(x) _aligned_free(x)
+# endif
+#endif
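+
+/*
+ * Illustrative use of the portability macros above (not part of the header):
+ *
+ *   DECLARE_ALIGNED(uint8_t tweak[16], 16);   // aligned on GCC/Clang and MSVC
+ *
+ *   void *p;
+ *   if (posix_memalign(&p, 64, 4096) == 0)    // _aligned_malloc() on Windows
+ *           aligned_free(p);                  // free() on unix, _aligned_free() on Windows
+ */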
+
+#ifdef DEBUG
+# define DEBUG_PRINT(x) printf x
+#else
+# define DEBUG_PRINT(x) do {} while (0)
+#endif
+
+
+#ifndef __has_feature
+# define __has_feature(x) 0
+#endif
+#ifndef __has_extension
+# define __has_extension __has_feature
+#endif
+#define ISAL_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+
+#if (defined(__ICC) || defined( __GNUC__ ) || defined(__clang__)) && !defined(ISAL_UNIT_TEST)
+# if __has_extension(attribute_deprecated_with_message) \
+ || (ISAL_GCC_VERSION >= 40500) \
+ || (__INTEL_COMPILER >= 1100)
+# define ISAL_DEPRECATED(message) __attribute__(( deprecated( message )))
+# else
+# define ISAL_DEPRECATED(message) __attribute__(( deprecated ))
+# endif
+#elif (defined( __ICL ) || defined(_MSC_VER))
+# if (__INTEL_COMPILER >= 1100) || (_MSC_FULL_VER >= 140050727)
+# define ISAL_DEPRECATED(message) __declspec( deprecated ( message ))
+# else
+# define ISAL_DEPRECATED(message) __declspec( deprecated )
+# endif
+#else
+# define ISAL_DEPRECATED(message)
+#endif
+
+#define ISAL_EXPERIMENTAL(message) ISAL_DEPRECATED("Experimental: " message)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //__TYPES_H
diff --git a/src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def b/src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def
new file mode 100644
index 000000000..de38b6d19
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def
@@ -0,0 +1,80 @@
+LIBRARY isa-l_crypto
+VERSION 2.24
+EXPORTS
+
+sha1_ctx_mgr_init @1
+sha1_ctx_mgr_submit @2
+sha1_ctx_mgr_flush @3
+sha256_ctx_mgr_init @4
+sha256_ctx_mgr_submit @5
+sha256_ctx_mgr_flush @6
+sha512_ctx_mgr_init @7
+sha512_ctx_mgr_submit @8
+sha512_ctx_mgr_flush @9
+md5_ctx_mgr_init @10
+md5_ctx_mgr_submit @11
+md5_ctx_mgr_flush @12
+mh_sha1_init @13
+mh_sha1_update @14
+mh_sha1_finalize @15
+mh_sha1_finalize_base @16
+mh_sha1_update_base @17
+mh_sha1_murmur3_x64_128_init @18
+mh_sha1_murmur3_x64_128_finalize_base @19
+mh_sha1_murmur3_x64_128_update_base @20
+mh_sha1_murmur3_x64_128_update @21
+mh_sha1_murmur3_x64_128_finalize @22
+aes_keyexp_128 @23
+aes_keyexp_192 @24
+aes_keyexp_256 @25
+aes_cbc_enc_128 @26
+aes_cbc_dec_128 @27
+aes_cbc_enc_192 @28
+aes_cbc_dec_192 @29
+aes_cbc_enc_256 @30
+aes_cbc_dec_256 @31
+aes_cbc_precomp @32
+XTS_AES_128_enc @33
+XTS_AES_128_enc_expanded_key @34
+XTS_AES_128_dec @35
+XTS_AES_128_dec_expanded_key @36
+XTS_AES_256_enc @37
+XTS_AES_256_enc_expanded_key @38
+XTS_AES_256_dec @39
+XTS_AES_256_dec_expanded_key @40
+mh_sha256_init @41
+mh_sha256_update @42
+mh_sha256_finalize @43
+mh_sha256_finalize_base @44
+mh_sha256_update_base @45
+rolling_hashx_mask_gen @46
+rolling_hash2_run @47
+rolling_hash2_reset @48
+rolling_hash2_init @49
+aes_gcm_pre_128 @50
+aes_gcm_enc_128 @51
+aes_gcm_dec_128 @52
+aes_gcm_init_128 @53
+aes_gcm_enc_128_update @54
+aes_gcm_dec_128_update @55
+aes_gcm_enc_128_finalize @56
+aes_gcm_dec_128_finalize @57
+aes_gcm_pre_256 @58
+aes_gcm_enc_256 @59
+aes_gcm_dec_256 @60
+aes_gcm_init_256 @61
+aes_gcm_enc_256_update @62
+aes_gcm_dec_256_update @63
+aes_gcm_enc_256_finalize @64
+aes_gcm_dec_256_finalize @65
+aes_gcm_enc_128_nt @66
+aes_gcm_dec_128_nt @67
+aes_gcm_enc_128_update_nt @68
+aes_gcm_dec_128_update_nt @69
+aes_gcm_enc_256_nt @70
+aes_gcm_dec_256_nt @71
+aes_gcm_enc_256_update_nt @72
+aes_gcm_dec_256_update_nt @73
+sm3_ctx_mgr_init @74
+sm3_ctx_mgr_submit @75
+sm3_ctx_mgr_flush @76
diff --git a/src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in b/src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in
new file mode 100644
index 000000000..41ba8d5a3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libisal_crypto
+Description: Crypto library for storage systems
+Version: @VERSION@
+Libs: -L${libdir} -lisal_crypto
+Libs.private:
+Cflags: -I${includedir}
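+# Illustrative consumer usage: cc app.c $(pkg-config --cflags --libs libisal_crypto)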
diff --git a/src/crypto/isa-l/isa-l_crypto/make.inc b/src/crypto/isa-l/isa-l_crypto/make.inc
new file mode 100644
index 000000000..0cb94d12d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/make.inc
@@ -0,0 +1,340 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+
+# Makefile include for optimized libraries
+# make targets:
+# lib - build library of optimized functions
+# slib - build shared library
+# test - run unit tests of functions
+# perf - run performance tests
+# install - install headers and libs to system location
+# sim - run on simulator
+# trace - get simulator trace
+# clean - remove object files
+
+version ?= 2.24.0
+
+
+
+CC = gcc
+AS = nasm
+AWK = awk
+
+DEBUG = -g
+DEBUG_yasm = -g dwarf2
+DEBUG_nasm = -g
+
+# Default arch= build options
+CFLAGS_ = -Wall
+ASFLAGS_ = -f elf64
+ARFLAGS_ = cr $@
+STRIP_gcc = strip -d -R .comment $@
+
+# arch=32 build options
+ASFLAGS_32 = -f elf32
+CFLAGS_32 = -m32
+ARFLAGS_32 = cr $@
+
+# arch=win64 build options
+ASFLAGS_win64 = -f win64
+CFLAGS_icl = -Qstd=c99
+ARFLAGS_win64 = -out:$@
+
+# arch=mingw build options
+ASFLAGS_mingw = -f win64
+ARFLAGS_mingw = cr $@
+
+LDFLAGS_so = -Wl,-soname,$(soname)
+
+ifeq ($(arch),mingw)
+ CC=x86_64-w64-mingw32-gcc
+ AR=x86_64-w64-mingw32-ar
+ LDFLAGS += -Wl,--force-exe-suffix
+ SIM=wine
+ EXT=.exe
+ CLEANFILES+=*.exe
+endif
+
+ASFLAGS_Darwin = -f macho64 --prefix=_
+ARFLAGS_Darwin = -r $@
+ifeq ($(shell uname),Darwin)
+ LDFLAGS_so =
+ STRIP_gcc =
+endif
+
+# arch=aarch64 build options
+ifeq ($(lib_debug),1)
+ ASFLAGS_aarch64 = -g -c
+else
+ ASFLAGS_aarch64 = -c
+endif
+
+ARFLAGS_aarch64 = cr $@
+ifeq ($(arch),aarch64)
+ AS=$(CC) -D__ASSEMBLY__
+ SIM=
+endif
+# arch=noarch build options
+ARFLAGS_noarch = cr $@
+CFLAGS_noarch= -DNOARCH
+ifeq ($(arch),noarch)
+ host_cpu=base_aliases
+endif
+INCLUDE = $(patsubst %,-I%/,$(subst :, ,$(VPATH)))
+CFLAGS = $(CFLAGS_$(arch)) $(CFLAGS_$(CC)) $(DEBUG) -O2 $(DEFINES) $(INCLUDE)
+ASFLAGS = $(ASFLAGS_$(arch)) $(ASFLAGS_$(CC)) $(DEBUG_$(AS)) $(DEFINES) $(INCLUDE)
+ARFLAGS = $(ARFLAGS_$(arch))
+DEFINES += $(addprefix -D , $D)
+CLEANFILES += $(O) *.o *.a $(all_tests) $(lib_name) $(so_lib_name)
+
+ifeq ($(filter aarch64 x86_%,$(host_cpu)),)
+ host_cpu=base_aliases
+endif
+lsrc += $(lsrc_$(host_cpu))
+O = bin
+lobj += $(patsubst %.c,%.o,$(patsubst %.S,%.o,$(patsubst %.asm,%.o,$(lsrc) $(lsrc_intrinsic))))
+objs = $(addprefix $(O)/,$(notdir $(lobj)))
+
+
+lib_name ?= isa-l_crypto.a
+default: lib slib
+
+# Defaults for windows build
+ifeq ($(arch),win64)
+ AR=lib
+ CC=cl
+ OUTPUT_OPTION = -Fo$@
+ DEBUG=
+ lib_name := $(basename $(lib_name)).lib
+endif
+lsrcwin64 = $(lsrc)
+unit_testswin64 = $(unit_tests)
+exampleswin64 = $(examples)
+perf_testswin64 = $(perf_tests)
+
+
+# Build and run unit tests, performance tests, etc.
+all_tests = $(notdir $(sort $(perf_tests) $(check_tests) $(unit_tests) $(examples) $(other_tests)))
+all_unit_tests = $(notdir $(sort $(check_tests) $(unit_tests)))
+all_perf_tests = $(notdir $(sort $(perf_tests)))
+all_check_tests = $(notdir $(sort $(check_tests)))
+
+$(all_unit_tests): % : %.c $(lib_name)
+$(all_perf_tests): % : %.c $(lib_name)
+$(sort $(notdir $(examples))): % : %.c $(lib_name)
+$(sort $(notdir $(other_tests))): % : %.c $(lib_name)
+
+# Check for a modern assembler
+test-as = $(shell hash printf && printf $(3) > $(2) && $(AS) $(ASFLAGS) ${tmpf} -o /dev/null 2> /dev/null && echo $(1) || echo $(4))
+as_4 := "pblendvb xmm2, xmm1;"
+as_6 := "vinserti32x8 zmm0, ymm1, 1;"
+as_10 := "vpcompressb zmm0 {k1}, zmm1;"
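+# Feature levels probed above: 4 = SSE4.1 (pblendvb), 6 = AVX-512 (vinserti32x8),
+# 10 = AVX-512 VBMI2 (vpcompressb).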
+
+tmpf := $(shell mktemp)
+as_feature_level := $(call test-as, 4, $(tmpf), $(as_4), $(as_feature_level))
+as_feature_level := $(call test-as, 6, $(tmpf), $(as_6), $(as_feature_level))
+as_feature_level := $(call test-as, 10, $(tmpf), $(as_10), $(as_feature_level))
+tmpf := $(shell rm ${tmpf})
+
+ifneq ($(findstring $(as_feature_level),6 10),)
+ D_HAVE_AS_KNOWS_AVX512_y := -DHAVE_AS_KNOWS_AVX512
+endif
+
+CFLAGS += -DAS_FEATURE_LEVEL=$(as_feature_level) $(D_HAVE_AS_KNOWS_AVX512_y)
+ASFLAGS += -DAS_FEATURE_LEVEL=$(as_feature_level) $(D_HAVE_AS_KNOWS_AVX512_y)
+
+sim test trace: $(addsuffix .run,$(all_unit_tests))
+perf: $(addsuffix .run,$(all_perf_tests))
+check: $(addsuffix .run,$(all_check_tests))
+ex: $(notdir $(examples))
+all: lib $(all_tests)
+other: $(notdir $(other_tests))
+tests: $(all_unit_tests)
+perfs: $(all_perf_tests)
+checks: $(all_check_tests)
+trace: SIM=sde -debugtrace --
+sim: SIM=sde --
+check test sim:
+ @echo Finished running $@
+
+$(objs): | $(O)
+$(O): ; mkdir -p $(O)
+
+# Build rule to run tests
+$(addsuffix .run,$(all_tests)): %.run : %
+ $(SIM) ./$<$(EXT)
+ @echo Completed run: $<
+
+# Other build rules
+msg = $(if $(DEBUG),DEBUG) $(patsubst 32,32-bit,$(host_cpu)) $D
+
+# gcc assembly files
+$(O)/%.o: $(host_cpu)/%.S
+ @echo " ---> Building $< $(msg)"
+ @$(AS) $(ASFLAGS) -o $@ $<
+
+$(O)/%.o : $(host_cpu)/%.c
+ @echo " ---> Building $< $(msg)"
+ @$(COMPILE.c) $(OUTPUT_OPTION) $<
+# yasm/nasm assembly files
+$(O)/%.o: %.asm
+ @echo " ---> Building $< $(msg)"
+ @$(AS) $(ASFLAGS) -o $@ $<
+
+
+$(O)/%.o %.o: %.c
+ @echo " ---> Building $< $(msg)"
+ @$(COMPILE.c) $(OUTPUT_OPTION) $<
+
+$(all_tests):
+ @echo " ---> Building Test $@ $(msg)"
+ @$(LINK.o) $(CFLAGS) $^ $(LDLIBS) -o $@
+
+
+# Target to build lib files
+lib: $(lib_name)
+ifneq ($(lib_debug),1)
+ $(lib_name): DEBUG_$(AS)= # Don't put debug symbols in the lib
+ $(lib_name): DEBUG=
+ $(lib_name): DEFINES+=-D NDEBUG
+endif
+ifeq ($(lib_debug),1)
+ DEBUG+=-D DEBUG # Define DEBUG for macros
+endif
+
+#lib $(lib_name): $(lib_name)(${objs})
+$(lib_name): $(objs)
+ @echo " ---> Creating Lib $@"
+ @$(AR) $(ARFLAGS) $^
+ifneq ($(lib_debug),1)
+ @$(STRIP_$(CC))
+endif
+
+
+# Target for shared lib
+so_lib_name = bin/libisal_crypto.so
+so_lib_inst = $(notdir $(so_lib_name))
+so_lib_ver = $(so_lib_inst).$(version)
+soname = $(so_lib_inst).$(word 1, $(subst ., ,$(version)))
+
+slib: $(so_lib_name)
+aobjs += $(addprefix $(O)/,$(patsubst %.asm,%.o,$(filter %.asm,$(notdir $(lsrc) $(lsrc_intrinsic)))))
+aobjs += $(addprefix $(O)/,$(patsubst %.S,%.o,$(filter %.S,$(notdir $(lsrc) $(lsrc_intrinsic)))))
+shared_objs += $(addprefix $(O)/shared_ver_,$(patsubst %.c,%.o,$(filter %.c,$(notdir $(lsrc) $(lsrc_intrinsic)))))
+
+$(O)/shared_ver_%.o: %.c
+ @echo " ---> Building shared $< $(msg)"
+ @$(COMPILE.c) $(OUTPUT_OPTION) $<
+
+$(O)/shared_ver_%.o: $(host_cpu)/%.c
+ @echo " ---> Building shared $< $(msg)"
+ @$(COMPILE.c) $(OUTPUT_OPTION) $<
+ifneq ($(lib_debug),1)
+ $(so_lib_name): DEBUG_$(AS)=
+ $(so_lib_name): DEBUG=
+ $(so_lib_name): DEFINES+=-D NDEBUG
+endif
+
+$(shared_objs): CFLAGS += -fPIC
+$(shared_objs) $(aobjs): | $(O)
+$(so_lib_name): LDFLAGS+=$(LDFLAGS_so)
+$(so_lib_name): $(shared_objs) $(aobjs)
+ @echo " ---> Creating Shared Lib $@"
+ @$(CC) $(CFLAGS) --shared $(LDFLAGS) -o $@ $^
+ @(cd $(@D); ln -f -s $(so_lib_inst) $(soname))
+
+isa-l_crypto.h:
+ @echo 'Building $@'
+ @echo '' >> $@
+ @echo '/**' >> $@
+ @echo ' * @file isa-l_crypto.h'>> $@
+ @echo ' * @brief Include for ISA-L_crypto library' >> $@
+ @echo ' */' >> $@
+ @echo '' >> $@
+ @echo '#ifndef _ISAL_CRYPTO_H_' >> $@
+ @echo '#define _ISAL_CRYPTO_H_' >> $@
+ @echo '' >> $@
+ @echo '#define.ISAL_CRYPTO_MAJOR_VERSION.${version}' | ${AWK} -F . '{print $$1, $$2, $$3}' >> $@
+ @echo '#define.ISAL_CRYPTO_MINOR_VERSION.${version}' | ${AWK} -F . '{print $$1, $$2, $$4}' >> $@
+ @echo '#define.ISAL_CRYPTO_PATCH_VERSION.${version}' | ${AWK} -F . '{print $$1, $$2, $$5}' >> $@
+ @echo '#define ISAL_CRYPTO_MAKE_VERSION(maj, min, patch) ((maj) * 0x10000 + (min) * 0x100 + (patch))' >> $@
+ @echo '#define ISAL_CRYPTO_VERSION ISAL_CRYPTO_MAKE_VERSION(ISAL_CRYPTO_MAJOR_VERSION, ISAL_CRYPTO_MINOR_VERSION, ISAL_CRYPTO_PATCH_VERSION)' >> $@
+ @echo '' >> $@
+ @for unit in $(sort $(extern_hdrs)); do echo "#include <isa-l_crypto/$$unit>" | sed -e 's;include/;;' >> $@; done
+ @echo '#endif //_ISAL_CRYPTO_H_' >> $@
+
+
+# Target for install
+prefix = /usr/local
+install_dirs = $(prefix)/lib $(prefix)/include/isa-l_crypto
+$(install_dirs): ; mkdir -p $@
+install: $(sort $(extern_hdrs)) | $(install_dirs) $(lib_name) $(so_lib_name) isa-l_crypto.h
+ install -m 644 $(lib_name) $(prefix)/lib/libisal_crypto.a
+ install -m 644 $^ $(prefix)/include/isa-l_crypto/.
+ install -m 664 isa-l_crypto.h $(prefix)/include/.
+ install -m 664 include/types.h $(prefix)/include/isa-l_crypto/.
+ install -m 664 include/endian_helper.h $(prefix)/include/isa-l_crypto/.
+ install -m 664 $(so_lib_name) $(prefix)/lib/$(so_lib_ver)
+ (cd $(prefix)/lib && ln -f -s $(so_lib_ver) $(soname) && ln -f -s $(so_lib_ver) $(so_lib_inst))
+ifeq ($(shell uname),Darwin)
+ (cd $(prefix)/lib && ln -f -s $(so_lib_ver) $(basename $(so_lib_inst)).dylib)
+ which glibtool && glibtool --mode=finish $(prefix)/lib
+else
+ which libtool && libtool --mode=finish $(prefix)/lib || \
+ echo 'Lib installed at $(prefix)/lib. Run system-dependent programs to add shared lib path.'
+endif
+
+uninstall:
+ $(RM) $(prefix)/lib/libisal_crypto.a
+ $(RM) $(prefix)/lib/$(soname)
+ $(RM) $(prefix)/lib/$(so_lib_ver)
+ $(RM) $(prefix)/lib/$(so_lib_inst)
+ $(RM) -r $(prefix)/include/isa-l_crypto
+ $(RM) $(prefix)/include/isa-l_crypto.h
+ $(RM) $(prefix)/lib/$(basename $(so_lib_inst)).dylib
+
+# Collect performance data
+rpt_name = perf_report_$(shell uname -n)_$(shell date +%y%m%d).perf
+
+perf_report:
+ echo Results for $(rpt_name) >> $(rpt_name)
+ $(MAKE) -f Makefile.unx -k perf | tee -a $(rpt_name)
+ @echo Summary:
+ -grep runtime $(rpt_name)
+
+
+clean:
+ @echo Cleaning up
+ @$(RM) -r $(CLEANFILES)
+
+doc: isa-l_crypto.h
+ (cat Doxyfile; echo 'PROJECT_NUMBER=$(version)') | doxygen -
+ $(MAKE) -C generated_doc/latex &> generated_doc/latex_build_api.log
+ cp generated_doc/latex/refman.pdf isa-l_crypto_api_$(version).pdf
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am
new file mode 100644
index 000000000..423f12945
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am
@@ -0,0 +1,98 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += md5_mb/md5_ctx_sse.c \
+ md5_mb/md5_ctx_avx.c \
+ md5_mb/md5_ctx_avx2.c \
+ md5_mb/md5_ctx_base.c
+
+lsrc_x86_64 += md5_mb/md5_mb_mgr_init_sse.c \
+ md5_mb/md5_mb_mgr_init_avx2.c \
+ md5_mb/md5_mb_mgr_init_avx512.c
+
+lsrc_x86_64 += md5_mb/md5_mb_mgr_submit_sse.asm \
+ md5_mb/md5_mb_mgr_submit_avx.asm \
+ md5_mb/md5_mb_mgr_submit_avx2.asm \
+ md5_mb/md5_mb_mgr_flush_sse.asm \
+ md5_mb/md5_mb_mgr_flush_avx.asm \
+ md5_mb/md5_mb_mgr_flush_avx2.asm \
+ md5_mb/md5_mb_x4x2_sse.asm \
+ md5_mb/md5_mb_x4x2_avx.asm \
+ md5_mb/md5_mb_x8x2_avx2.asm \
+ md5_mb/md5_multibinary.asm
+
+lsrc_x86_64 += md5_mb/md5_mb_mgr_submit_avx512.asm \
+ md5_mb/md5_mb_mgr_flush_avx512.asm \
+ md5_mb/md5_mb_x16x2_avx512.asm \
+ md5_mb/md5_ctx_avx512.c
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += md5_mb/md5_ctx_base.c \
+ md5_mb/aarch64/md5_ctx_aarch64_asimd.c \
+ md5_mb/aarch64/md5_mb_aarch64_dispatcher.c \
+ md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c \
+ md5_mb/aarch64/md5_mb_asimd_x4.S \
+ md5_mb/aarch64/md5_mb_asimd_x1.S \
+ md5_mb/aarch64/md5_mb_multibinary.S
+
+
+lsrc_base_aliases += md5_mb/md5_ctx_base.c \
+ md5_mb/md5_ctx_base_aliases.c
+src_include += -I $(srcdir)/md5_mb
+extern_hdrs += include/md5_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ md5_mb/md5_job.asm \
+ md5_mb/md5_mb_mgr_datastruct.asm \
+ md5_mb/md5_ref.c \
+ include/reg_sizes.asm \
+ include/multibinary.asm \
+ include/memcpy_inline.h \
+ include/intrinreg.h
+
+check_tests += md5_mb/md5_mb_test \
+ md5_mb/md5_mb_rand_test \
+ md5_mb/md5_mb_rand_update_test
+
+unit_tests += md5_mb/md5_mb_rand_ssl_test
+
+perf_tests += md5_mb/md5_mb_vs_ossl_perf
+
+
+md5_mb_rand_test: md5_ref.o
+md5_mb_md5_mb_rand_test_LDADD = md5_mb/md5_ref.lo libisal_crypto.la
+md5_mb_rand_update_test: md5_ref.o
+md5_mb_md5_mb_rand_update_test_LDADD = md5_mb/md5_ref.lo libisal_crypto.la
+md5_mb_rand_ssl_test: LDLIBS += -lcrypto
+md5_mb_md5_mb_rand_ssl_test_LDFLAGS = -lcrypto
+md5_mb_vs_ossl_perf: LDLIBS += -lcrypto
+md5_mb_md5_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_ctx_aarch64_asimd.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_ctx_aarch64_asimd.c
new file mode 100644
index 000000000..e9a708c17
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_ctx_aarch64_asimd.c
@@ -0,0 +1,230 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdlib.h>
+#include <assert.h>
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+void md5_mb_mgr_init_asimd(MD5_MB_JOB_MGR * state);
+MD5_JOB *md5_mb_mgr_submit_asimd(MD5_MB_JOB_MGR * state, MD5_JOB * job);
+MD5_JOB *md5_mb_mgr_flush_asimd(MD5_MB_JOB_MGR * state);
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_asimd(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_asimd(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_asimd(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_asimd(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_asimd(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_asimd(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (MD5_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
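+	/* Advance i past the 0x80 byte, the zero fill and the 8-byte length field,
+	   so it becomes the total padded tail length (one or two whole blocks). */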
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_aarch64_dispatcher.c
new file mode 100644
index 000000000..14ef3a6e6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_aarch64_dispatcher.c
@@ -0,0 +1,59 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(md5_ctx_mgr_submit)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(md5_ctx_mgr_submit_asimd);
+
+ return PROVIDER_BASIC(md5_ctx_mgr_submit);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(md5_ctx_mgr_init)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(md5_ctx_mgr_init_asimd);
+
+ return PROVIDER_BASIC(md5_ctx_mgr_init);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(md5_ctx_mgr_flush)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(md5_ctx_mgr_flush_asimd);
+
+ return PROVIDER_BASIC(md5_ctx_mgr_flush);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x1.S b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x1.S
new file mode 100644
index 000000000..27d112494
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x1.S
@@ -0,0 +1,248 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ q_\name .req q\reg
+ v_\name .req v\reg
+ s_\name .req s\reg
+.endm
+
+
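+/* Rounds 0-15 use F(b,c,d) = (b AND c) OR ((NOT b) AND d),
+   computed below as ((c XOR d) AND b) XOR d. */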
+.macro round_0_15 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
+ eor tmp0,\d_c,\d_d
+ mov k,\kl
+ and tmp0,tmp0,\d_b
+ movk k,\kh,lsl 16
+ eor tmp0,tmp0,\d_d
+ add tmp1,k,\w
+ add tmp0,tmp1,tmp0
+ add tmp0,\d_a,tmp0
+ ror tmp0,tmp0,32 - \r
+ add \d_a,\d_b,tmp0
+.endm
+
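+/* Rounds 16-31 use G(b,c,d) = (b AND d) OR (c AND (NOT d)),
+   computed below as ((b XOR c) AND d) XOR c. */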
+.macro round_16_31 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
+ eor tmp0,\d_b,\d_c
+ mov k,\kl
+ and tmp0,tmp0,\d_d
+ movk k,\kh,lsl 16
+ eor tmp0,tmp0,\d_c
+ add tmp1,k,\w
+ add tmp0,tmp1,tmp0
+ add tmp0,\d_a,tmp0
+ ror tmp0,tmp0,32 - \r
+ add \d_a,\d_b,tmp0
+.endm
+
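+/* Rounds 32-47 use H(b,c,d) = b XOR c XOR d. */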
+.macro round_32_47 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
+ eor tmp0,\d_b,\d_c
+ mov k,\kl
+ eor tmp0,tmp0,\d_d
+ movk k,\kh,lsl 16
+ add tmp1,k,\w
+ add tmp0,tmp1,tmp0
+ add tmp0,\d_a,tmp0
+ ror tmp0,tmp0,32 - \r
+ add \d_a,\d_b,tmp0
+.endm
+
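+/* Rounds 48-63 use I(b,c,d) = c XOR (b OR (NOT d)). */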
+.macro round_48_63 d_a:req,d_b:req,d_c:req,d_d:req,kh:req,kl:req,w:req,r:req
+ orn tmp0,\d_b,\d_d
+ mov k,\kl
+ eor tmp0,tmp0,\d_c
+ movk k,\kh,lsl 16
+ add tmp1,k,\w
+ add tmp0,tmp1,tmp0
+ add tmp0,\d_a,tmp0
+ ror tmp0,tmp0,32 - \r
+ add \d_a,\d_b,tmp0
+.endm
+/*
+ variables
+*/
+ job0 .req x0
+ digest_addr .req x0
+ len .req w1
+ end .req x1
+
+ buf_adr .req x2
+ d_a .req w3
+ d_b .req w4
+ d_c .req w5
+ d_d .req w6
+ k .req w7
+ m0 .req w8
+ m1 .req w9
+ m2 .req w10
+ m3 .req w11
+ m4 .req w12
+ m5 .req w13
+ m6 .req w14
+ m7 .req w15
+ m8 .req w19
+ m9 .req w20
+ m10 .req w21
+ m11 .req w22
+ m12 .req w23
+ m13 .req w24
+ m14 .req w25
+ m15 .req w26
+
+ tmp0 .req w27
+ tmp1 .req w28
+
+ d_a1 .req w8
+ d_b1 .req w9
+ d_c1 .req w15
+ d_d1 .req w19
+
+/*
+ void md5_mb_asimd_x1(MD5_JOB * job0,int len)
+*/
+ .global md5_mb_asimd_x1
+ .type md5_mb_asimd_x1, %function
+md5_mb_asimd_x1:
+ cmp len,0
+ stp x29, x30, [sp,-96]!
+ ldr buf_adr,[job0],64
+ stp x19, x20, [sp, 16]
+ add end,buf_adr,end,lsl 6
+ stp x21, x22, [sp, 32]
+ ldp d_a,d_b,[digest_addr]
+ stp x23, x24, [sp, 48]
+ ldp d_c,d_d,[digest_addr,8]
+ stp x25, x26, [sp, 64]
+ stp x27, x28, [sp, 80]
+ ble .exit
+
+.loop_start:
+ ldp m0,m1,[buf_adr],8
+ ldp m2,m3,[buf_adr],8
+ round_0_15 d_a,d_b,d_c,d_d,0xd76a,0xa478,m0,7
+
+ ldp m4,m5,[buf_adr],8
+ round_0_15 d_d,d_a,d_b,d_c,0xe8c7,0xb756,m1,12
+ ldp m6,m7,[buf_adr],8
+ round_0_15 d_c,d_d,d_a,d_b,0x2420,0x70db,m2,17
+ ldp m8,m9,[buf_adr],8
+ round_0_15 d_b,d_c,d_d,d_a,0xc1bd,0xceee,m3,22
+ ldp m10,m11,[buf_adr],8
+ round_0_15 d_a,d_b,d_c,d_d,0xf57c,0xfaf,m4,7
+ ldp m12,m13,[buf_adr],8
+ round_0_15 d_d,d_a,d_b,d_c,0x4787,0xc62a,m5,12
+ ldp m14,m15,[buf_adr],8
+ round_0_15 d_c,d_d,d_a,d_b,0xa830,0x4613,m6,17
+ round_0_15 d_b,d_c,d_d,d_a,0xfd46,0x9501,m7,22
+ round_0_15 d_a,d_b,d_c,d_d,0x6980,0x98d8,m8,7
+ round_0_15 d_d,d_a,d_b,d_c,0x8b44,0xf7af,m9,12
+ round_0_15 d_c,d_d,d_a,d_b,0xffff,0x5bb1,m10,17
+ round_0_15 d_b,d_c,d_d,d_a,0x895c,0xd7be,m11,22
+ round_0_15 d_a,d_b,d_c,d_d,0x6b90,0x1122,m12,7
+ round_0_15 d_d,d_a,d_b,d_c,0xfd98,0x7193,m13,12
+ round_0_15 d_c,d_d,d_a,d_b,0xa679,0x438e,m14,17
+ round_0_15 d_b,d_c,d_d,d_a,0x49b4,0x821,m15,22
+
+ round_16_31 d_a,d_b,d_c,d_d,0xf61e,0x2562,m1,5
+ round_16_31 d_d,d_a,d_b,d_c,0xc040,0xb340,m6,9
+ round_16_31 d_c,d_d,d_a,d_b,0x265e,0x5a51,m11,14
+ round_16_31 d_b,d_c,d_d,d_a,0xe9b6,0xc7aa,m0,20
+ round_16_31 d_a,d_b,d_c,d_d,0xd62f,0x105d,m5,5
+ round_16_31 d_d,d_a,d_b,d_c,0x244,0x1453,m10,9
+ round_16_31 d_c,d_d,d_a,d_b,0xd8a1,0xe681,m15,14
+ round_16_31 d_b,d_c,d_d,d_a,0xe7d3,0xfbc8,m4,20
+ round_16_31 d_a,d_b,d_c,d_d,0x21e1,0xcde6,m9,5
+ round_16_31 d_d,d_a,d_b,d_c,0xc337,0x7d6,m14,9
+ round_16_31 d_c,d_d,d_a,d_b,0xf4d5,0xd87,m3,14
+ round_16_31 d_b,d_c,d_d,d_a,0x455a,0x14ed,m8,20
+ round_16_31 d_a,d_b,d_c,d_d,0xa9e3,0xe905,m13,5
+ round_16_31 d_d,d_a,d_b,d_c,0xfcef,0xa3f8,m2,9
+ round_16_31 d_c,d_d,d_a,d_b,0x676f,0x2d9,m7,14
+ round_16_31 d_b,d_c,d_d,d_a,0x8d2a,0x4c8a,m12,20
+
+ round_32_47 d_a,d_b,d_c,d_d,0xfffa,0x3942,m5,4
+ round_32_47 d_d,d_a,d_b,d_c,0x8771,0xf681,m8,11
+ round_32_47 d_c,d_d,d_a,d_b,0x6d9d,0x6122,m11,16
+ round_32_47 d_b,d_c,d_d,d_a,0xfde5,0x380c,m14,23
+ round_32_47 d_a,d_b,d_c,d_d,0xa4be,0xea44,m1,4
+ round_32_47 d_d,d_a,d_b,d_c,0x4bde,0xcfa9,m4,11
+ round_32_47 d_c,d_d,d_a,d_b,0xf6bb,0x4b60,m7,16
+ round_32_47 d_b,d_c,d_d,d_a,0xbebf,0xbc70,m10,23
+ round_32_47 d_a,d_b,d_c,d_d,0x289b,0x7ec6,m13,4
+ round_32_47 d_d,d_a,d_b,d_c,0xeaa1,0x27fa,m0,11
+ round_32_47 d_c,d_d,d_a,d_b,0xd4ef,0x3085,m3,16
+ round_32_47 d_b,d_c,d_d,d_a,0x488,0x1d05,m6,23
+ round_32_47 d_a,d_b,d_c,d_d,0xd9d4,0xd039,m9,4
+ round_32_47 d_d,d_a,d_b,d_c,0xe6db,0x99e5,m12,11
+ round_32_47 d_c,d_d,d_a,d_b,0x1fa2,0x7cf8,m15,16
+ round_32_47 d_b,d_c,d_d,d_a,0xc4ac,0x5665,m2,23
+
+ round_48_63 d_a,d_b,d_c,d_d,0xf429,0x2244,m0,6
+ round_48_63 d_d,d_a,d_b,d_c,0x432a,0xff97,m7,10
+ round_48_63 d_c,d_d,d_a,d_b,0xab94,0x23a7,m14,15
+ round_48_63 d_b,d_c,d_d,d_a,0xfc93,0xa039,m5,21
+ round_48_63 d_a,d_b,d_c,d_d,0x655b,0x59c3,m12,6
+ round_48_63 d_d,d_a,d_b,d_c,0x8f0c,0xcc92,m3,10
+ round_48_63 d_c,d_d,d_a,d_b,0xffef,0xf47d,m10,15
+ round_48_63 d_b,d_c,d_d,d_a,0x8584,0x5dd1,m1,21
+ round_48_63 d_a,d_b,d_c,d_d,0x6fa8,0x7e4f,m8,6
+ round_48_63 d_d,d_a,d_b,d_c,0xfe2c,0xe6e0,m15,10
+ round_48_63 d_c,d_d,d_a,d_b,0xa301,0x4314,m6,15
+ round_48_63 d_b,d_c,d_d,d_a,0x4e08,0x11a1,m13,21
+ round_48_63 d_a,d_b,d_c,d_d,0xf753,0x7e82,m4,6
+ ldp d_a1,d_b1,[digest_addr]
+ round_48_63 d_d,d_a,d_b,d_c,0xbd3a,0xf235,m11,10
+ ldp d_c1,d_d1,[digest_addr,8]
+ round_48_63 d_c,d_d,d_a,d_b,0x2ad7,0xd2bb,m2,15
+ round_48_63 d_b,d_c,d_d,d_a,0xeb86,0xd391,m9,21
+
+ cmp buf_adr,end
+ add d_a,d_a1 ,d_a
+ str d_a,[digest_addr]
+ add d_b,d_b1 ,d_b
+ str d_b,[digest_addr,4]
+ add d_c,d_c1 ,d_c
+ str d_c,[digest_addr,8]
+ add d_d,d_d1 ,d_d
+ str d_d,[digest_addr,12]
+ bne .loop_start
+
+.exit:
+ ldp x19, x20, [sp, 16]
+ ldp x21, x22, [sp, 32]
+ ldp x23, x24, [sp, 48]
+ ldp x25, x26, [sp, 64]
+ ldp x27, x28, [sp, 80]
+ ldp x29, x30, [sp], 96
+ ret
+ .size md5_mb_asimd_x1, .-md5_mb_asimd_x1
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x4.S b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x4.S
new file mode 100644
index 000000000..53979131d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_asimd_x4.S
@@ -0,0 +1,526 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ q_\name .req q\reg
+ v_\name .req v\reg
+ s_\name .req s\reg
+.endm
+
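+/* Common tail of every round across the four lanes:
+   a = b + rotl32(a + f(b,c,d) + k + w, r), with f(b,c,d) expected in v_tmp1. */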
+.macro add_key_rol a:req,b:req,k:req,w:req,r:req
+ add v_tmp0.4s,v_\k\().4s,v_\w\().4s
+ add v_tmp1.4s,v_tmp1.4s,v_\a\().4s
+ add v_tmp1.4s,v_tmp1.4s,v_tmp0.4s
+ shl v_tmp0.4s,v_tmp1.4s,\r
+ ushr v_tmp1.4s,v_tmp1.4s,32-\r
+ orr v_tmp0.16b,v_tmp1.16b,v_tmp0.16b
+
+ add v_\a\().4s,v_\b\().4s,v_tmp0.4s
+.endm
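+/* Rounds 0-15: F(b,c,d) = (b AND c) OR ((NOT b) AND d), via BSL with b as the mask. */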
+.macro round_0_15 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req
+ mov v_tmp1.16b, v_\b\().16b
+ bsl v_tmp1.16b, v_\c\().16b, v_\d\().16b
+ ldr q_\k1,[key_adr],16
+ add_key_rol \a,\b,\k,\w,\r
+.endm
+
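+/* Rounds 16-31: G(b,c,d) = (b AND d) OR (c AND (NOT d)), via BSL with d as the mask. */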
+.macro round_16_31 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req
+ mov v_tmp1.16b, v_\d\().16b
+ bsl v_tmp1.16b, v_\b\().16b, v_\c\().16b
+ ldr q_\k1,[key_adr],16
+ add_key_rol \a,\b,\k,\w,\r
+.endm
+
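+/* Rounds 32-47: H(b,c,d) = b XOR c XOR d. */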
+.macro round_32_47 a:req,b:req,c:req,d:req,k:req,k1:req,w:req,r:req
+ eor v_tmp1.16b,v_\b\().16b,v_\c\().16b
+ eor v_tmp1.16b,v_tmp1.16b,v_\d\().16b
+ ldr q_\k1,[key_adr],16
+ add_key_rol \a,\b,\k,\w,\r
+.endm
+
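+/* Rounds 48-63: I(b,c,d) = c XOR (b OR (NOT d)); the trailing key load is
+   skipped when the k1 argument is left empty (final round). */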
+.macro round_48_63 a:req,b:req,c:req,d:req,k:req,k1,w:req,r:req
+ orn v_tmp1.16b,v_\b\().16b,v_\d\().16b
+ eor v_tmp1.16b,v_tmp1.16b,v_\c\().16b
+ .ifnb \k1
+ ldr q_\k1,[key_adr],16
+ .endif
+ add_key_rol \a,\b,\k,\w,\r
+.endm
+/*
+ variables
+*/
+ declare_var_vector_reg tmp0, 0
+ declare_var_vector_reg tmp1, 1
+ declare_var_vector_reg k, 2
+ declare_var_vector_reg k1, 3
+ declare_var_vector_reg a, 4
+ declare_var_vector_reg b, 5
+ declare_var_vector_reg c, 6
+ declare_var_vector_reg d, 7
+ declare_var_vector_reg a1, 8
+ declare_var_vector_reg b1, 9
+ declare_var_vector_reg c1, 10
+ declare_var_vector_reg d1, 11
+
+ declare_var_vector_reg w0, 16
+ declare_var_vector_reg w1, 17
+ declare_var_vector_reg w2, 18
+ declare_var_vector_reg w3, 19
+ declare_var_vector_reg w4, 20
+ declare_var_vector_reg w5, 21
+ declare_var_vector_reg w6, 22
+ declare_var_vector_reg w7, 23
+ declare_var_vector_reg w8, 24
+ declare_var_vector_reg w9, 25
+ declare_var_vector_reg w10, 26
+ declare_var_vector_reg w11, 27
+ declare_var_vector_reg w12, 28
+ declare_var_vector_reg w13, 29
+ declare_var_vector_reg w14, 30
+ declare_var_vector_reg w15, 31
+
+ len .req w4
+ len_x .req x4
+ lane0 .req x5
+ lane1 .req x6
+ lane2 .req x7
+ lane3 .req x9
+ end .req x4
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ job3 .req x3
+ key_adr .req x10
+
+/*
+ void md5_mb_asimd_x4(MD5_JOB * job0, MD5_JOB * job1,
+ MD5_JOB * job2, MD5_JOB * job3, int len)
+*/
+ .global md5_mb_asimd_x4
+ .type md5_mb_asimd_x4, %function
+md5_mb_asimd_x4:
+ stp x29,x30,[sp,-48]!
+ ldr lane0,[job0],64
+ stp d8,d9,[sp,16]
+ ldr lane1,[job1],64
+ stp d10,d11,[sp,32]
+ ldr lane2,[job2],64
+ cmp len,0
+ ldr lane3,[job3],64
+ ble .exit
+
+ //load digests
+ ld4 {v_a.s-v_d.s}[0],[job0]
+ add end,lane0,len_x,lsl 6
+ ld4 {v_a.s-v_d.s}[1],[job1]
+ ld4 {v_a.s-v_d.s}[2],[job2]
+ ld4 {v_a.s-v_d.s}[3],[job3]
+.loop_start:
+ ld1 {v_w0.s}[0],[lane0],4
+ mov v_a1.16b,v_a.16b
+ ld1 {v_w0.s}[1],[lane1],4
+ mov v_b1.16b,v_b.16b
+ ld1 {v_w0.s}[2],[lane2],4
+ mov v_c1.16b,v_c.16b
+ ld1 {v_w0.s}[3],[lane3],4
+ mov v_d1.16b,v_d.16b
+
+ ld3 {v_w1.s-v_w3.s}[0],[lane0],12
+ adrp key_adr,.key_consts
+ ld3 {v_w1.s-v_w3.s}[1],[lane1],12
+ add key_adr,key_adr,#:lo12:.key_consts
+ ld3 {v_w1.s-v_w3.s}[2],[lane2],12
+ ldr q_k,[key_adr],16
+ ld3 {v_w1.s-v_w3.s}[3],[lane3],12
+
+
+ ld4 {v_w4.s-v_w7.s}[0], [lane0],16
+
+ round_0_15 a,b,c,d,k,k1,w0,7
+
+ ld4 {v_w4.s-v_w7.s}[1], [lane1],16
+ round_0_15 d,a,b,c,k1,k,w1,12
+ ld4 {v_w4.s-v_w7.s}[2], [lane2],16
+ round_0_15 c,d,a,b,k,k1,w2,17
+ ld4 {v_w4.s-v_w7.s}[3], [lane3],16
+ round_0_15 b,c,d,a,k1,k,w3,22
+ ld4 {v_w8.s-v_w11.s}[0],[lane0],16
+ round_0_15 a,b,c,d,k,k1,w4,7
+ ld4 {v_w8.s-v_w11.s}[1],[lane1],16
+ round_0_15 d,a,b,c,k1,k,w5,12
+ ld4 {v_w8.s-v_w11.s}[2],[lane2],16
+ round_0_15 c,d,a,b,k,k1,w6,17
+ ld4 {v_w8.s-v_w11.s}[3],[lane3],16
+ round_0_15 b,c,d,a,k1,k,w7,22
+ ld4 {v_w12.s-v_w15.s}[0],[lane0],16
+ round_0_15 a,b,c,d,k,k1,w8,7
+ ld4 {v_w12.s-v_w15.s}[1],[lane1],16
+ round_0_15 d,a,b,c,k1,k,w9,12
+ ld4 {v_w12.s-v_w15.s}[2],[lane2],16
+ round_0_15 c,d,a,b,k,k1,w10,17
+ ld4 {v_w12.s-v_w15.s}[3],[lane3],16
+ round_0_15 b,c,d,a,k1,k,w11,22
+ round_0_15 a,b,c,d,k,k1,w12,7
+ round_0_15 d,a,b,c,k1,k,w13,12
+ round_0_15 c,d,a,b,k,k1,w14,17
+ round_0_15 b,c,d,a,k1,k,w15,22
+
+ round_16_31 a,b,c,d,k,k1,w1,5
+ round_16_31 d,a,b,c,k1,k,w6,9
+ round_16_31 c,d,a,b,k,k1,w11,14
+ round_16_31 b,c,d,a,k1,k,w0,20
+ round_16_31 a,b,c,d,k,k1,w5,5
+ round_16_31 d,a,b,c,k1,k,w10,9
+ round_16_31 c,d,a,b,k,k1,w15,14
+ round_16_31 b,c,d,a,k1,k,w4,20
+ round_16_31 a,b,c,d,k,k1,w9,5
+ round_16_31 d,a,b,c,k1,k,w14,9
+ round_16_31 c,d,a,b,k,k1,w3,14
+ round_16_31 b,c,d,a,k1,k,w8,20
+ round_16_31 a,b,c,d,k,k1,w13,5
+ round_16_31 d,a,b,c,k1,k,w2,9
+ round_16_31 c,d,a,b,k,k1,w7,14
+ round_16_31 b,c,d,a,k1,k,w12,20
+
+ round_32_47 a,b,c,d,k,k1,w5,4
+ round_32_47 d,a,b,c,k1,k,w8,11
+ round_32_47 c,d,a,b,k,k1,w11,16
+ round_32_47 b,c,d,a,k1,k,w14,23
+ round_32_47 a,b,c,d,k,k1,w1,4
+ round_32_47 d,a,b,c,k1,k,w4,11
+ round_32_47 c,d,a,b,k,k1,w7,16
+ round_32_47 b,c,d,a,k1,k,w10,23
+ round_32_47 a,b,c,d,k,k1,w13,4
+ round_32_47 d,a,b,c,k1,k,w0,11
+ round_32_47 c,d,a,b,k,k1,w3,16
+ round_32_47 b,c,d,a,k1,k,w6,23
+ round_32_47 a,b,c,d,k,k1,w9,4
+ round_32_47 d,a,b,c,k1,k,w12,11
+ round_32_47 c,d,a,b,k,k1,w15,16
+ round_32_47 b,c,d,a,k1,k,w2,23
+
+ round_48_63 a,b,c,d,k,k1,w0,6
+ round_48_63 d,a,b,c,k1,k,w7,10
+ round_48_63 c,d,a,b,k,k1,w14,15
+ round_48_63 b,c,d,a,k1,k,w5,21
+ round_48_63 a,b,c,d,k,k1,w12,6
+ round_48_63 d,a,b,c,k1,k,w3,10
+ round_48_63 c,d,a,b,k,k1,w10,15
+ round_48_63 b,c,d,a,k1,k,w1,21
+ round_48_63 a,b,c,d,k,k1,w8,6
+ round_48_63 d,a,b,c,k1,k,w15,10
+ round_48_63 c,d,a,b,k,k1,w6,15
+ round_48_63 b,c,d,a,k1,k,w13,21
+ round_48_63 a,b,c,d,k,k1,w4,6
+ round_48_63 d,a,b,c,k1,k,w11,10
+ round_48_63 c,d,a,b,k,k1,w2,15
+ round_48_63 b,c,d,a,k1, ,w9,21
+
+
+
+
+ cmp lane0,end
+ add v_a.4s,v_a1.4s,v_a.4s
+ add v_b.4s,v_b1.4s,v_b.4s
+ add v_c.4s,v_c1.4s,v_c.4s
+ add v_d.4s,v_d1.4s,v_d.4s
+ bne .loop_start
+
+ st4 {v_a.s-v_d.s}[0],[job0]
+ st4 {v_a.s-v_d.s}[1],[job1]
+ st4 {v_a.s-v_d.s}[2],[job2]
+ st4 {v_a.s-v_d.s}[3],[job3]
+.exit:
+ ldp d8,d9,[sp,16]
+ ldp d10,d11,[sp,32]
+ ldp x29,x30,[sp],48
+ ret
+.key_consts:
+ .word 0xd76aa478
+ .word 0xd76aa478
+ .word 0xd76aa478
+ .word 0xd76aa478
+ .word 0xe8c7b756
+ .word 0xe8c7b756
+ .word 0xe8c7b756
+ .word 0xe8c7b756
+ .word 0x242070db
+ .word 0x242070db
+ .word 0x242070db
+ .word 0x242070db
+ .word 0xc1bdceee
+ .word 0xc1bdceee
+ .word 0xc1bdceee
+ .word 0xc1bdceee
+ .word 0xf57c0faf
+ .word 0xf57c0faf
+ .word 0xf57c0faf
+ .word 0xf57c0faf
+ .word 0x4787c62a
+ .word 0x4787c62a
+ .word 0x4787c62a
+ .word 0x4787c62a
+ .word 0xa8304613
+ .word 0xa8304613
+ .word 0xa8304613
+ .word 0xa8304613
+ .word 0xfd469501
+ .word 0xfd469501
+ .word 0xfd469501
+ .word 0xfd469501
+ .word 0x698098d8
+ .word 0x698098d8
+ .word 0x698098d8
+ .word 0x698098d8
+ .word 0x8b44f7af
+ .word 0x8b44f7af
+ .word 0x8b44f7af
+ .word 0x8b44f7af
+ .word 0xffff5bb1
+ .word 0xffff5bb1
+ .word 0xffff5bb1
+ .word 0xffff5bb1
+ .word 0x895cd7be
+ .word 0x895cd7be
+ .word 0x895cd7be
+ .word 0x895cd7be
+ .word 0x6b901122
+ .word 0x6b901122
+ .word 0x6b901122
+ .word 0x6b901122
+ .word 0xfd987193
+ .word 0xfd987193
+ .word 0xfd987193
+ .word 0xfd987193
+ .word 0xa679438e
+ .word 0xa679438e
+ .word 0xa679438e
+ .word 0xa679438e
+ .word 0x49b40821
+ .word 0x49b40821
+ .word 0x49b40821
+ .word 0x49b40821
+ .word 0xf61e2562
+ .word 0xf61e2562
+ .word 0xf61e2562
+ .word 0xf61e2562
+ .word 0xc040b340
+ .word 0xc040b340
+ .word 0xc040b340
+ .word 0xc040b340
+ .word 0x265e5a51
+ .word 0x265e5a51
+ .word 0x265e5a51
+ .word 0x265e5a51
+ .word 0xe9b6c7aa
+ .word 0xe9b6c7aa
+ .word 0xe9b6c7aa
+ .word 0xe9b6c7aa
+ .word 0xd62f105d
+ .word 0xd62f105d
+ .word 0xd62f105d
+ .word 0xd62f105d
+ .word 0x02441453
+ .word 0x02441453
+ .word 0x02441453
+ .word 0x02441453
+ .word 0xd8a1e681
+ .word 0xd8a1e681
+ .word 0xd8a1e681
+ .word 0xd8a1e681
+ .word 0xe7d3fbc8
+ .word 0xe7d3fbc8
+ .word 0xe7d3fbc8
+ .word 0xe7d3fbc8
+ .word 0x21e1cde6
+ .word 0x21e1cde6
+ .word 0x21e1cde6
+ .word 0x21e1cde6
+ .word 0xc33707d6
+ .word 0xc33707d6
+ .word 0xc33707d6
+ .word 0xc33707d6
+ .word 0xf4d50d87
+ .word 0xf4d50d87
+ .word 0xf4d50d87
+ .word 0xf4d50d87
+ .word 0x455a14ed
+ .word 0x455a14ed
+ .word 0x455a14ed
+ .word 0x455a14ed
+ .word 0xa9e3e905
+ .word 0xa9e3e905
+ .word 0xa9e3e905
+ .word 0xa9e3e905
+ .word 0xfcefa3f8
+ .word 0xfcefa3f8
+ .word 0xfcefa3f8
+ .word 0xfcefa3f8
+ .word 0x676f02d9
+ .word 0x676f02d9
+ .word 0x676f02d9
+ .word 0x676f02d9
+ .word 0x8d2a4c8a
+ .word 0x8d2a4c8a
+ .word 0x8d2a4c8a
+ .word 0x8d2a4c8a
+ .word 0xfffa3942
+ .word 0xfffa3942
+ .word 0xfffa3942
+ .word 0xfffa3942
+ .word 0x8771f681
+ .word 0x8771f681
+ .word 0x8771f681
+ .word 0x8771f681
+ .word 0x6d9d6122
+ .word 0x6d9d6122
+ .word 0x6d9d6122
+ .word 0x6d9d6122
+ .word 0xfde5380c
+ .word 0xfde5380c
+ .word 0xfde5380c
+ .word 0xfde5380c
+ .word 0xa4beea44
+ .word 0xa4beea44
+ .word 0xa4beea44
+ .word 0xa4beea44
+ .word 0x4bdecfa9
+ .word 0x4bdecfa9
+ .word 0x4bdecfa9
+ .word 0x4bdecfa9
+ .word 0xf6bb4b60
+ .word 0xf6bb4b60
+ .word 0xf6bb4b60
+ .word 0xf6bb4b60
+ .word 0xbebfbc70
+ .word 0xbebfbc70
+ .word 0xbebfbc70
+ .word 0xbebfbc70
+ .word 0x289b7ec6
+ .word 0x289b7ec6
+ .word 0x289b7ec6
+ .word 0x289b7ec6
+ .word 0xeaa127fa
+ .word 0xeaa127fa
+ .word 0xeaa127fa
+ .word 0xeaa127fa
+ .word 0xd4ef3085
+ .word 0xd4ef3085
+ .word 0xd4ef3085
+ .word 0xd4ef3085
+ .word 0x04881d05
+ .word 0x04881d05
+ .word 0x04881d05
+ .word 0x04881d05
+ .word 0xd9d4d039
+ .word 0xd9d4d039
+ .word 0xd9d4d039
+ .word 0xd9d4d039
+ .word 0xe6db99e5
+ .word 0xe6db99e5
+ .word 0xe6db99e5
+ .word 0xe6db99e5
+ .word 0x1fa27cf8
+ .word 0x1fa27cf8
+ .word 0x1fa27cf8
+ .word 0x1fa27cf8
+ .word 0xc4ac5665
+ .word 0xc4ac5665
+ .word 0xc4ac5665
+ .word 0xc4ac5665
+ .word 0xf4292244
+ .word 0xf4292244
+ .word 0xf4292244
+ .word 0xf4292244
+ .word 0x432aff97
+ .word 0x432aff97
+ .word 0x432aff97
+ .word 0x432aff97
+ .word 0xab9423a7
+ .word 0xab9423a7
+ .word 0xab9423a7
+ .word 0xab9423a7
+ .word 0xfc93a039
+ .word 0xfc93a039
+ .word 0xfc93a039
+ .word 0xfc93a039
+ .word 0x655b59c3
+ .word 0x655b59c3
+ .word 0x655b59c3
+ .word 0x655b59c3
+ .word 0x8f0ccc92
+ .word 0x8f0ccc92
+ .word 0x8f0ccc92
+ .word 0x8f0ccc92
+ .word 0xffeff47d
+ .word 0xffeff47d
+ .word 0xffeff47d
+ .word 0xffeff47d
+ .word 0x85845dd1
+ .word 0x85845dd1
+ .word 0x85845dd1
+ .word 0x85845dd1
+ .word 0x6fa87e4f
+ .word 0x6fa87e4f
+ .word 0x6fa87e4f
+ .word 0x6fa87e4f
+ .word 0xfe2ce6e0
+ .word 0xfe2ce6e0
+ .word 0xfe2ce6e0
+ .word 0xfe2ce6e0
+ .word 0xa3014314
+ .word 0xa3014314
+ .word 0xa3014314
+ .word 0xa3014314
+ .word 0x4e0811a1
+ .word 0x4e0811a1
+ .word 0x4e0811a1
+ .word 0x4e0811a1
+ .word 0xf7537e82
+ .word 0xf7537e82
+ .word 0xf7537e82
+ .word 0xf7537e82
+ .word 0xbd3af235
+ .word 0xbd3af235
+ .word 0xbd3af235
+ .word 0xbd3af235
+ .word 0x2ad7d2bb
+ .word 0x2ad7d2bb
+ .word 0x2ad7d2bb
+ .word 0x2ad7d2bb
+ .word 0xeb86d391
+ .word 0xeb86d391
+ .word 0xeb86d391
+ .word 0xeb86d391
+ .size md5_mb_asimd_x4, .-md5_mb_asimd_x4
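
The .key_consts table above holds the 64 standard MD5 round constants, T[i] = floor(2^32 * |sin(i + 1)|), with each value repeated four times so the x4 kernel can load one constant per round as a full vector and apply it to all four lanes at once. A minimal sketch, not part of the patch, that regenerates the replicated table for verification (compile with -lm):

/* regenerate the x4 MD5 constant table; illustrative only */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	for (int i = 0; i < 64; i++) {
		uint32_t t = (uint32_t) floor(fabs(sin(i + 1)) * 4294967296.0);
		for (int lane = 0; lane < 4; lane++)	/* one copy per SIMD lane */
			printf("	.word 0x%08x\n", t);
	}
	return 0;
}
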
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c
new file mode 100644
index 000000000..5289cd91f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_mgr_aarch64_asimd.c
@@ -0,0 +1,187 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <md5_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define MD5_MB_CE_MAX_LANES 4
+void md5_mb_asimd_x4(MD5_JOB *, MD5_JOB *, MD5_JOB *, MD5_JOB *, int);
+void md5_mb_asimd_x1(MD5_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+void md5_mb_mgr_init_asimd(MD5_MB_JOB_MGR * state)
+{
+ unsigned int i;
+
+ state->unused_lanes[0] = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = 0; i < MD5_MB_CE_MAX_LANES; i++) {
+ state->unused_lanes[0] <<= 4;
+ state->unused_lanes[0] |= MD5_MB_CE_MAX_LANES - 1 - i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	// lanes with index >= MD5_MB_CE_MAX_LANES are invalid
+ for (; i < MD5_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int md5_mb_mgr_do_jobs(MD5_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i;
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+ if (state->num_lanes_inuse == 4) {
+ len = min(min(state->lens[0], state->lens[1]),
+ min(state->lens[2], state->lens[3]));
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+ md5_mb_asimd_x4(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane,
+ state->ldata[3].job_in_lane, len >> 4);
+		// only the lane holding the shortest job completes in this pass; return its index
+ for (i = 0; i < MD5_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+
+ return lane_idx;
+ } else {
+ for (i = 0; i < MD5_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ len = state->lens[i] & (~0xf);
+ md5_mb_asimd_x1(state->ldata[i].job_in_lane, len >> 4);
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ return i;
+ }
+ }
+ }
+ return -1;
+
+}
+
+static MD5_JOB *md5_mb_mgr_free_lane(MD5_MB_JOB_MGR * state)
+{
+ int i;
+ MD5_JOB *ret = NULL;
+
+ for (i = 0; i < MD5_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes[0] <<= 4;
+ state->unused_lanes[0] |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void md5_mb_mgr_insert_job(MD5_MB_JOB_MGR * state, MD5_JOB * job)
+{
+ int lane_idx;
+	// pick the next free lane from the unused-lane queue
+	lane_idx = state->unused_lanes[0] & 0xf;
+	// running out of lanes here indicates a fatal logic error
+ assert(lane_idx < MD5_MB_CE_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes[0] >>= 4;
+ state->num_lanes_inuse++;
+}
+
+MD5_JOB *md5_mb_mgr_submit_asimd(MD5_MB_JOB_MGR * state, MD5_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ MD5_JOB *ret;
+
+ //add job into lanes
+ md5_mb_mgr_insert_job(state, job);
+
+ ret = md5_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	// submit only starts hashing once every lane has a job queued
+ if (state->num_lanes_inuse < MD5_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = md5_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ md5_mb_mgr_do_jobs(state);
+#endif
+
+ ret = md5_mb_mgr_free_lane(state);
+ return ret;
+}
+
+MD5_JOB *md5_mb_mgr_flush_asimd(MD5_MB_JOB_MGR * state)
+{
+ MD5_JOB *ret;
+ ret = md5_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ md5_mb_mgr_do_jobs(state);
+ return md5_mb_mgr_free_lane(state);
+
+}
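
The manager above packs each lane's state into a single word: state->lens[i] keeps the remaining length in 64-byte blocks in its upper bits and the lane index in the low 4 bits, so one min() over the four entries yields both the shortest job and the lane it occupies. A small self-contained sketch of that packing, with illustrative names, not part of the patch:

#include <assert.h>
#include <stdint.h>

/* low 4 bits: lane index; remaining bits: block count */
static inline uint32_t pack_len(uint32_t blocks, uint32_t lane)
{
	return (blocks << 4) | lane;
}

int main(void)
{
	uint32_t lens[4] = {
		pack_len(10, 0), pack_len(3, 1), pack_len(7, 2), pack_len(12, 3)
	};
	uint32_t m = lens[0];
	for (int i = 1; i < 4; i++)
		if (lens[i] < m)
			m = lens[i];
	assert((m & 0xf) == 1);			/* lane 1 holds the shortest job */
	assert(((m & ~0xf) >> 4) == 3);		/* which is 3 blocks long */
	return 0;
}
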
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_multibinary.S
new file mode 100644
index 000000000..b66320f5c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/aarch64/md5_mb_multibinary.S
@@ -0,0 +1,36 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface md5_ctx_mgr_submit
+mbin_interface md5_ctx_mgr_init
+mbin_interface md5_ctx_mgr_flush
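
In the usual ISA-L multibinary scheme, each mbin_interface line declares a public symbol whose first call runs a dispatcher that picks the best implementation for the running CPU and caches a function pointer, so later calls are a single indirect jump. A minimal self-contained C sketch of that lazy-dispatch pattern, with illustrative names only, not the macro's actual output:

#include <stdio.h>

static int impl_base(void)  { return puts("base impl"); }
static int impl_asimd(void) { return puts("asimd impl"); }
static int cpu_has_asimd(void) { return 1; }	/* stand-in for a HWCAP/CPUID probe */

static int dispatch(void);
static int (*entry)(void) = dispatch;		/* the public symbol jumps through this */

static int dispatch(void)
{
	entry = cpu_has_asimd()? impl_asimd : impl_base;	/* resolve once */
	return entry();
}

int main(void)
{
	entry();	/* first call resolves the implementation */
	entry();	/* later calls go straight to it */
	return 0;
}
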
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c
new file mode 100644
index 000000000..ac03a6705
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c
@@ -0,0 +1,263 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx")
+#endif
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_avx(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_avx(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_avx(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_avx(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (MD5_BLOCK_SIZE - 1));
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_avx_slver_02020183;
+struct slver md5_ctx_mgr_init_avx_slver = { 0x0183, 0x02, 0x02 };
+
+struct slver md5_ctx_mgr_submit_avx_slver_02020184;
+struct slver md5_ctx_mgr_submit_avx_slver = { 0x0184, 0x02, 0x02 };
+
+struct slver md5_ctx_mgr_flush_avx_slver_02020185;
+struct slver md5_ctx_mgr_flush_avx_slver = { 0x0185, 0x02, 0x02 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
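
For reference, the context manager above follows the usual ISA-L multi-buffer usage: each message gets its own MD5_HASH_CTX, submit feeds contexts in, and flush drains whatever is still in flight. A minimal usage sketch, assuming the standard hash_ctx_init() helper from multi_buffer.h and glossing over the allocation and alignment details a real caller handles:

#include <string.h>
#include "md5_mb.h"

int main(void)
{
	static MD5_HASH_CTX_MGR mgr;		/* static storage keeps the sketch simple */
	static MD5_HASH_CTX ctxpool[4];
	const char *msg = "hello";

	md5_ctx_mgr_init_avx(&mgr);
	for (int i = 0; i < 4; i++) {
		hash_ctx_init(&ctxpool[i]);	/* assumed helper from multi_buffer.h */
		md5_ctx_mgr_submit_avx(&mgr, &ctxpool[i], msg,
				       (uint32_t) strlen(msg), HASH_ENTIRE);
	}
	while (md5_ctx_mgr_flush_avx(&mgr) != NULL)
		;	/* each returned context carries its digest in ctx->job.result_digest */
	return 0;
}
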
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c
new file mode 100644
index 000000000..cdc910c0d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c
@@ -0,0 +1,263 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_avx2(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_avx2(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_avx2(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (MD5_BLOCK_SIZE - 1));
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_avx2_slver_04020186;
+struct slver md5_ctx_mgr_init_avx2_slver = { 0x0186, 0x02, 0x04 };
+
+struct slver md5_ctx_mgr_submit_avx2_slver_04020187;
+struct slver md5_ctx_mgr_submit_avx2_slver = { 0x0187, 0x02, 0x04 };
+
+struct slver md5_ctx_mgr_flush_avx2_slver_04020188;
+struct slver md5_ctx_mgr_flush_avx2_slver = { 0x0188, 0x02, 0x04 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
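
The hash_pad() helper above writes the 0x80 marker, the zero fill, and the 64-bit bit-length into the extra-block buffer and reports whether that padding occupies one or two 64-byte blocks; two are needed whenever more than 55 residual bytes are already buffered. A small self-contained check of the same arithmetic, with local names, not part of the patch:

#include <assert.h>
#include <stdint.h>

/* mirror of the block-count arithmetic in hash_pad() */
static uint32_t extra_blocks(uint64_t total_len)
{
	uint32_t i = (uint32_t) (total_len & 63);
	i += (63 & (0 - (total_len + 8 + 1))) + 1 + 8;
	return i >> 6;
}

int main(void)
{
	assert(extra_blocks(0) == 1);	/* empty message: one pad block */
	assert(extra_blocks(55) == 1);	/* 55 residual bytes still fit */
	assert(extra_blocks(56) == 2);	/* 56 residual bytes spill into a second block */
	return 0;
}
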
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c
new file mode 100644
index 000000000..682c2ed5e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c
@@ -0,0 +1,267 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_avx512(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_avx512(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_avx512(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (MD5_BLOCK_SIZE - 1));
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_avx512_slver_0600018c;
+struct slver md5_ctx_mgr_init_avx512_slver = { 0x018c, 0x00, 0x06 };
+
+struct slver md5_ctx_mgr_submit_avx512_slver_0600018d;
+struct slver md5_ctx_mgr_submit_avx512_slver = { 0x018d, 0x00, 0x06 };
+
+struct slver md5_ctx_mgr_flush_avx512_slver_0600018e;
+struct slver md5_ctx_mgr_flush_avx512_slver = { 0x018e, 0x00, 0x06 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
+
+#endif // HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base.c
new file mode 100644
index 000000000..c1d2a2738
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base.c
@@ -0,0 +1,291 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (c ^ (d & (b ^ c)))
+#define F3(b,c,d) (b ^ c ^ d)
+#define F4(b,c,d) (c ^ (b | ~d))
+
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+#define step(i,a,b,c,d,f,k,w,r) \
+ if (i < 16) {f = F1(b,c,d); } else \
+ if (i < 32) {f = F2(b,c,d); } else \
+ if (i < 48) {f = F3(b,c,d); } else \
+ {f = F4(b,c,d); } \
+ f = a + f + k + to_le32(w); \
+ a = b + rol32(f, r);
+
+static void md5_init(MD5_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static uint32_t md5_update(MD5_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static void md5_final(MD5_HASH_CTX * ctx, uint32_t remain_len);
+static void OPT_FIX md5_single(const void *data, uint32_t digest[4]);
+static inline void hash_init_digest(MD5_WORD_T * digest);
+
+void md5_ctx_mgr_init_base(MD5_HASH_CTX_MGR * mgr)
+{
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_base(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ uint32_t remain_len;
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) {
+ // Cannot submit a new entire job to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags == HASH_FIRST) {
+
+ md5_init(ctx, buffer, len);
+ md5_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_UPDATE) {
+ md5_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_LAST) {
+ remain_len = md5_update(ctx, buffer, len);
+ md5_final(ctx, remain_len);
+ }
+
+ if (flags == HASH_ENTIRE) {
+ md5_init(ctx, buffer, len);
+ remain_len = md5_update(ctx, buffer, len);
+ md5_final(ctx, remain_len);
+ }
+
+ return ctx;
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_base(MD5_HASH_CTX_MGR * mgr)
+{
+ return NULL;
+}
+
+static void md5_init(MD5_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Mark it as processing
+ ctx->status = HASH_CTX_STS_PROCESSING;
+}
+
+static uint32_t md5_update(MD5_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ uint32_t remain_len = len;
+ uint32_t *digest = ctx->job.result_digest;
+ while (remain_len >= 64) {
+ md5_single(buffer, digest);
+ buffer = (void *)((uint8_t *) buffer + 64);
+ remain_len -= 64;
+ ctx->total_length += 64;
+ }
+
+ ctx->status = HASH_CTX_STS_IDLE;
+ ctx->incoming_buffer = buffer;
+ return remain_len;
+}
+
+static void md5_final(MD5_HASH_CTX * ctx, uint32_t remain_len)
+{
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t i = remain_len, j;
+ uint8_t buf[128];
+ uint32_t *digest = ctx->job.result_digest;
+
+ ctx->total_length += i;
+ memcpy(buf, buffer, i);
+ buf[i++] = 0x80;
+ for (j = i; j < 120; j++)
+ buf[j] = 0;
+
+ if (i > 64 - 8)
+ i = 128;
+ else
+ i = 64;
+
+ *(uint64_t *) (buf + i - 8) = to_le64((uint64_t) ctx->total_length * 8);
+
+ md5_single(buf, digest);
+ if (i == 128) {
+ md5_single(buf + 64, digest);
+ }
+
+ ctx->status = HASH_CTX_STS_COMPLETE;
+}
+
+static void md5_single(const void *data, uint32_t digest[4])
+{
+
+ uint32_t a, b, c, d;
+ uint32_t f;
+ uint32_t *w = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+
+ step(0, a, b, c, d, f, 0xd76aa478, w[0], 7);
+ step(1, d, a, b, c, f, 0xe8c7b756, w[1], 12);
+ step(2, c, d, a, b, f, 0x242070db, w[2], 17);
+ step(3, b, c, d, a, f, 0xc1bdceee, w[3], 22);
+ step(4, a, b, c, d, f, 0xf57c0faf, w[4], 7);
+ step(5, d, a, b, c, f, 0x4787c62a, w[5], 12);
+ step(6, c, d, a, b, f, 0xa8304613, w[6], 17);
+ step(7, b, c, d, a, f, 0xfd469501, w[7], 22);
+ step(8, a, b, c, d, f, 0x698098d8, w[8], 7);
+ step(9, d, a, b, c, f, 0x8b44f7af, w[9], 12);
+ step(10, c, d, a, b, f, 0xffff5bb1, w[10], 17);
+ step(11, b, c, d, a, f, 0x895cd7be, w[11], 22);
+ step(12, a, b, c, d, f, 0x6b901122, w[12], 7);
+ step(13, d, a, b, c, f, 0xfd987193, w[13], 12);
+ step(14, c, d, a, b, f, 0xa679438e, w[14], 17);
+ step(15, b, c, d, a, f, 0x49b40821, w[15], 22);
+
+ step(16, a, b, c, d, f, 0xf61e2562, w[1], 5);
+ step(17, d, a, b, c, f, 0xc040b340, w[6], 9);
+ step(18, c, d, a, b, f, 0x265e5a51, w[11], 14);
+ step(19, b, c, d, a, f, 0xe9b6c7aa, w[0], 20);
+ step(20, a, b, c, d, f, 0xd62f105d, w[5], 5);
+ step(21, d, a, b, c, f, 0x02441453, w[10], 9);
+ step(22, c, d, a, b, f, 0xd8a1e681, w[15], 14);
+ step(23, b, c, d, a, f, 0xe7d3fbc8, w[4], 20);
+ step(24, a, b, c, d, f, 0x21e1cde6, w[9], 5);
+ step(25, d, a, b, c, f, 0xc33707d6, w[14], 9);
+ step(26, c, d, a, b, f, 0xf4d50d87, w[3], 14);
+ step(27, b, c, d, a, f, 0x455a14ed, w[8], 20);
+ step(28, a, b, c, d, f, 0xa9e3e905, w[13], 5);
+ step(29, d, a, b, c, f, 0xfcefa3f8, w[2], 9);
+ step(30, c, d, a, b, f, 0x676f02d9, w[7], 14);
+ step(31, b, c, d, a, f, 0x8d2a4c8a, w[12], 20);
+
+ step(32, a, b, c, d, f, 0xfffa3942, w[5], 4);
+ step(33, d, a, b, c, f, 0x8771f681, w[8], 11);
+ step(34, c, d, a, b, f, 0x6d9d6122, w[11], 16);
+ step(35, b, c, d, a, f, 0xfde5380c, w[14], 23);
+ step(36, a, b, c, d, f, 0xa4beea44, w[1], 4);
+ step(37, d, a, b, c, f, 0x4bdecfa9, w[4], 11);
+ step(38, c, d, a, b, f, 0xf6bb4b60, w[7], 16);
+ step(39, b, c, d, a, f, 0xbebfbc70, w[10], 23);
+ step(40, a, b, c, d, f, 0x289b7ec6, w[13], 4);
+ step(41, d, a, b, c, f, 0xeaa127fa, w[0], 11);
+ step(42, c, d, a, b, f, 0xd4ef3085, w[3], 16);
+ step(43, b, c, d, a, f, 0x04881d05, w[6], 23);
+ step(44, a, b, c, d, f, 0xd9d4d039, w[9], 4);
+ step(45, d, a, b, c, f, 0xe6db99e5, w[12], 11);
+ step(46, c, d, a, b, f, 0x1fa27cf8, w[15], 16);
+ step(47, b, c, d, a, f, 0xc4ac5665, w[2], 23);
+
+ step(48, a, b, c, d, f, 0xf4292244, w[0], 6);
+ step(49, d, a, b, c, f, 0x432aff97, w[7], 10);
+ step(50, c, d, a, b, f, 0xab9423a7, w[14], 15);
+ step(51, b, c, d, a, f, 0xfc93a039, w[5], 21);
+ step(52, a, b, c, d, f, 0x655b59c3, w[12], 6);
+ step(53, d, a, b, c, f, 0x8f0ccc92, w[3], 10);
+ step(54, c, d, a, b, f, 0xffeff47d, w[10], 15);
+ step(55, b, c, d, a, f, 0x85845dd1, w[1], 21);
+ step(56, a, b, c, d, f, 0x6fa87e4f, w[8], 6);
+ step(57, d, a, b, c, f, 0xfe2ce6e0, w[15], 10);
+ step(58, c, d, a, b, f, 0xa3014314, w[6], 15);
+ step(59, b, c, d, a, f, 0x4e0811a1, w[13], 21);
+ step(60, a, b, c, d, f, 0xf7537e82, w[4], 6);
+ step(61, d, a, b, c, f, 0xbd3af235, w[11], 10);
+ step(62, c, d, a, b, f, 0x2ad7d2bb, w[2], 15);
+ step(63, b, c, d, a, f, 0xeb86d391, w[9], 21);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_base_slver_0000018f;
+struct slver md5_ctx_mgr_init_base_slver = { 0x018f, 0x00, 0x00 };
+
+struct slver md5_ctx_mgr_submit_base_slver_00000190;
+struct slver md5_ctx_mgr_submit_base_slver = { 0x0190, 0x00, 0x00 };
+
+struct slver md5_ctx_mgr_flush_base_slver_00000191;
+struct slver md5_ctx_mgr_flush_base_slver = { 0x0191, 0x00, 0x00 };
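
The F1/F2 macros in the base implementation above are the familiar reduced-operation rewrites of the textbook MD5 selection functions: d ^ (b & (c ^ d)) equals (b & c) | (~b & d), and c ^ (d & (b ^ c)) equals (d & b) | (~d & c). A small self-contained sketch, not part of the patch, that verifies both identities over a handful of values:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t vals[] = { 0x00000000u, 0xffffffffu, 0x12345678u, 0xdeadbeefu };
	for (int i = 0; i < 4; i++)
		for (int j = 0; j < 4; j++)
			for (int k = 0; k < 4; k++) {
				uint32_t b = vals[i], c = vals[j], d = vals[k];
				/* F1: choose c or d depending on each bit of b */
				assert((d ^ (b & (c ^ d))) == ((b & c) | (~b & d)));
				/* F2: choose b or c depending on each bit of d */
				assert((c ^ (d & (b ^ c))) == ((d & b) | (~d & c)));
			}
	return 0;
}
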
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base_aliases.c
new file mode 100644
index 000000000..42e29ab5a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_base_aliases.c
@@ -0,0 +1,50 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <string.h>
+#include "md5_mb.h"
+extern void md5_ctx_mgr_init_base(MD5_HASH_CTX_MGR * mgr);
+extern MD5_HASH_CTX *md5_ctx_mgr_flush_base(MD5_HASH_CTX_MGR * mgr);
+extern MD5_HASH_CTX *md5_ctx_mgr_submit_base(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags);
+void md5_ctx_mgr_init(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_ctx_mgr_init_base(mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush(MD5_HASH_CTX_MGR * mgr)
+{
+ return md5_ctx_mgr_flush_base(mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ return md5_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c
new file mode 100644
index 000000000..1e7e91916
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c
@@ -0,0 +1,249 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_sse(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_sse(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_sse(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_sse(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (MD5_BLOCK_SIZE - 1));
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_sse_slver_00020180;
+struct slver md5_ctx_mgr_init_sse_slver = { 0x0180, 0x02, 0x00 };
+
+struct slver md5_ctx_mgr_submit_sse_slver_00020181;
+struct slver md5_ctx_mgr_submit_sse_slver = { 0x0181, 0x02, 0x00 };
+
+struct slver md5_ctx_mgr_flush_sse_slver_00020182;
+struct slver md5_ctx_mgr_flush_sse_slver = { 0x0182, 0x02, 0x00 };
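
The hash_pad() helper above packs the whole MD5 finalisation into two lines of arithmetic. A minimal standalone C sketch (illustration only; the constants 64 and 8 are the usual MD5_BLOCK_SIZE and MD5_PADLENGTHFIELD_SIZE values assumed from md5_mb.h, and pad_end is a hypothetical name) reproduces just the extra-block calculation for two message lengths:

    /* Sketch of the hash_pad() arithmetic above; not part of the patch. */
    #include <stdint.h>
    #include <stdio.h>

    #define BLOCK 64   /* MD5_BLOCK_SIZE (assumed)          */
    #define LENSZ 8    /* MD5_PADLENGTHFIELD_SIZE (assumed) */

    static uint32_t pad_end(uint64_t total_len)
    {
            uint32_t i = (uint32_t)(total_len & (BLOCK - 1)); /* bytes already in the last block */
            /* 0x80 goes at offset i, then zero fill, then the 8-byte bit count */
            i += ((BLOCK - 1) & (0 - (total_len + LENSZ + 1))) + 1 + LENSZ;
            return i;                                         /* always a multiple of 64 */
    }

    int main(void)
    {
            /* 100-byte message: 36 bytes spill into the pad buffer -> 1 extra block */
            printf("%u\n", pad_end(100) / BLOCK);   /* prints 1 */
            /* 60-byte message: no room for 0x80 plus the length field -> 2 extra blocks */
            printf("%u\n", pad_end(60) / BLOCK);    /* prints 2 */
            return 0;
    }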
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm
new file mode 100644
index 000000000..7719946f0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm
@@ -0,0 +1,55 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define JOB_MD5 structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; JOB_MD5
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD	_len,	4,	4	; length in blocks
+FIELD _result_digest, 4*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+END_FIELDS
+
+%assign _JOB_MD5_size _FIELD_OFFSET
+%assign _JOB_MD5_align _STRUCT_ALIGN
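
For readers coming from the C side, the FIELD entries above correspond roughly to the layout below. This is a sketch only; the authoritative definition is the MD5 job structure in md5_mb.h, and the struct name here is illustrative.

    /* Rough C view of the JOB_MD5 layout described by the FIELD macros above. */
    #include <stdint.h>

    struct job_md5_sketch {
            uint8_t  *buffer;       /* _buffer: data to hash                          */
            uint32_t  len;          /* _len: length in blocks (see md5_ctx_sse.c)     */
            uint32_t  result_digest[4]
                    __attribute__((aligned(64)));  /* _result_digest (GCC/Clang syntax) */
            uint32_t  status;       /* _status: STS_UNKNOWN / BEING_PROCESSED / COMPLETED */
            void     *user_data;    /* _user_data                                     */
    };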
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..6caad6733
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm
@@ -0,0 +1,73 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define MD5 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MD5_ARGS_X32
+;;; name size align
+FIELD _digest, 4*4*32, 16 ; transposed digest
+FIELD _data_ptr, 8*32, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _MD5_ARGS_X8_size _FIELD_OFFSET
+%assign _MD5_ARGS_X8_align _STRUCT_ALIGN
+%assign _MD5_ARGS_X16_size _FIELD_OFFSET
+%assign _MD5_ARGS_X16_align _STRUCT_ALIGN
+%assign _MD5_ARGS_X32_size _FIELD_OFFSET
+%assign _MD5_ARGS_X32_align _STRUCT_ALIGN
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _MD5_ARGS_X8_size, _MD5_ARGS_X8_align
+FIELD _lens, 4*32, 8
+FIELD _unused_lanes, 8*4, 8
+FIELD _ldata, _LANE_DATA_size*32, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
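
The _digest area above is stored transposed (word-major rather than lane-major), which is what lets the SIMD kernels load the same digest word from many lanes with a single vector load. A C-style sketch of the indexing for the 32-lane X32 layout (illustration only):

    /* Word-major digest indexing: offset = 4*lane + word*4*LANES bytes. */
    #include <stdint.h>

    #define MD5_WORDS 4
    #define LANES     32

    static inline uint32_t *digest_word(uint32_t digest[MD5_WORDS][LANES],
                                        int lane, int word)
    {
            return &digest[word][lane];
    }

This is why the flush routines below read the four digest words of lane idx at a stride of 4*lanes bytes rather than contiguously.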
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..b74646de4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm
@@ -0,0 +1,248 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x4x2_avx
+
+[bits 64]
+default rel
+section .text
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_avx
+%define idx r8
+
+%define unused_lanes r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; JOB* md5_mb_mgr_flush_avx(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+mk_global md5_mb_mgr_flush_avx, function
+md5_mb_mgr_flush_avx:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; if bit (32+3) is set, then all lanes are empty
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x4x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
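
In scalar terms, the flush path above does three things: park every empty lane on a live lane's buffer with an effectively infinite length, take the unsigned minimum of the packed length words, and run the kernel for that many blocks so that at least the minimum-length lane completes and can be retired at len_is_0. A rough C equivalent of the lane selection (illustration only, hypothetical names):

    /* Scalar sketch of copy_lane_data plus the vpminud reduction above.
     * lens[] entries are packed as (blocks << 4) | lane. */
    #include <stddef.h>
    #include <stdint.h>

    #define LANES 8

    static uint32_t flush_pick_lane(void *job_in_lane[LANES],
                                    const void *data_ptr[LANES],
                                    uint32_t lens[LANES], uint32_t live_idx)
    {
            uint32_t i, min = 0xFFFFFFFF;

            for (i = 0; i < LANES; i++) {
                    if (job_in_lane[i] == NULL) {   /* empty lane                */
                            data_ptr[i] = data_ptr[live_idx];
                            lens[i] = 0xFFFFFFFF;   /* never selected as minimum */
                    }
                    if (lens[i] < min)
                            min = lens[i];
            }
            return min & 0xF;   /* lane index; remaining blocks = min >> 4 */
    }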
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..910d5af89
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm
@@ -0,0 +1,255 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x8x2_avx2
+
+[bits 64]
+default rel
+section .text
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x8x2_avx2
+%define idx rbp
+
+%define unused_lanes r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%define num_lanes_inuse r8
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; JOB* md5_mb_mgr_flush_avx2(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+mk_global md5_mb_mgr_flush_avx2, function
+md5_mb_mgr_flush_avx2:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x8x2_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
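
The AVX2 manager above handles sixteen lanes but uses the same packing of block count and lane index into one 32-bit length word, so a single unsigned minimum finds both at once; the AVX512 variant below simply widens the lane field to six bits (and idx, 0x3F / shr len2, 6). Illustrative helpers, not part of the sources:

    /* Length/lane packing used by the lens[] arrays in these managers. */
    #include <stdint.h>

    static inline uint32_t pack_len(uint32_t blocks, uint32_t lane)
    {
            return (blocks << 4) | (lane & 0xF);
    }

    static inline void unpack_len(uint32_t v, uint32_t *blocks, uint32_t *lane)
    {
            *lane   = v & 0xF;      /* "and idx, 0xF"  */
            *blocks = v >> 4;       /* "shr len2, 4"   */
    }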
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..a0eaf428a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm
@@ -0,0 +1,315 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern md5_mb_x16x2_avx512
+
+[bits 64]
+default rel
+section .text
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x16x2_avx512
+%define idx rbp
+
+%define unused_lanes ymm7
+%define lane r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%define num_lanes_inuse r8
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+;; Byte shift in MEM addr, reads an extra byte at [addr+32]
+%macro MEM_VPSRLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+ vmovdqu %%TMP_YMM, [%%addr + 1]
+ vmovdqu [%%addr], %%TMP_YMM
+ mov [%%addr + 31], byte 0
+%endmacro
+
+;; Byte shift in MEM addr, reads an extra byte at [addr-1]
+%macro MEM_VPSLLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+ vmovdqu %%TMP_YMM, [%%addr-1]
+ vmovdqu [%%addr], %%TMP_YMM
+ mov [%%addr], byte 0
+%endmacro
+
+align 64
+
+; JOB* md5_mb_mgr_flush_avx512(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+mk_global md5_mb_mgr_flush_avx512, function
+md5_mb_mgr_flush_avx512:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 31
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 32
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ ; Find min length
+ vmovdqu ymm5, [state + _lens + 2*32]
+ vmovdqu ymm6, [state + _lens + 3*32]
+
+ vpminud ymm4, ymm5, ymm6 ; ymm4 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm4, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm4, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+ vperm2i128 ymm3, ymm4, ymm4, 1 ; ymm3 has halves of ymm4 reversed
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+
+ vpminud ymm2, ymm2, ymm4 ; ymm2 has min value in low dword
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0x3F
+ shr len2, 6
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_6bits]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+ vpsubd ymm5, ymm5, ymm2
+ vpsubd ymm6, ymm6, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+ vmovdqu [state + _lens + 2*32], ymm5
+ vmovdqu [state + _lens + 3*32], ymm6
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x16x2_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov lane, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+
+ shl lane, 8
+ or lane, idx
+ MEM_VPSLLDDQ (state + _unused_lanes), unused_lanes
+ mov [state + _unused_lanes], lane
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16*2]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16*2], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16*2], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16*2], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_6bits:
+ dq 0x00000000FFFFFFC0, 0x0000000000000000
+ dq 0x00000000FFFFFFC0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+lane_16: dq 16
+lane_17: dq 17
+lane_18: dq 18
+lane_19: dq 19
+lane_20: dq 20
+lane_21: dq 21
+lane_22: dq 22
+lane_23: dq 23
+lane_24: dq 24
+lane_25: dq 25
+lane_26: dq 26
+lane_27: dq 27
+lane_28: dq 28
+lane_29: dq 29
+lane_30: dq 30
+lane_31: dq 31
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_md5_mb_mgr_flush_avx512
+no_md5_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
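
Unlike the nibble-packed free list used by the SSE/AVX/AVX2 managers, the AVX512 manager keeps its 32 free lane numbers as one byte each (four uint64_t words, initialised to 0x00..0x1f by md5_mb_mgr_init_avx512 below), and the MEM_VPSLLDDQ/MEM_VPSRLDDQ macros shift that 32-byte queue in memory by one entry. A scalar sketch of the queue discipline those macros plus the "shl lane, 8 / or lane, idx" sequence implement (illustration only):

    /* Byte-per-lane free queue; q[0] is the next lane to hand out. */
    #include <stdint.h>
    #include <string.h>

    static void push_free_lane(uint8_t q[32], uint8_t idx)
    {
            memmove(q + 1, q, 31);  /* MEM_VPSLLDDQ: shift the queue up one byte */
            q[0] = idx;             /* freed lane becomes the next one used      */
    }

    static uint8_t pop_free_lane(uint8_t q[32])
    {
            uint8_t idx = q[0];
            memmove(q, q + 1, 31);  /* MEM_VPSRLDDQ equivalent */
            q[31] = 0;
            return idx;
    }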
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..d3aa25f86
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm
@@ -0,0 +1,249 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x4x2_sse
+
+[bits 64]
+default rel
+section .text
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_sse
+%define idx r8
+
+%define unused_lanes r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; JOB* md5_mb_mgr_flush_sse(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+mk_global md5_mb_mgr_flush_sse, function
+md5_mb_mgr_flush_sse:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; if bit (32+3) is set, then all lanes are empty
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ movdqa xmm0, [state + _lens + 0*16]
+ movdqa xmm1, [state + _lens + 1*16]
+
+ movdqa xmm2, xmm0
+ pminud xmm2, xmm1 ; xmm2 has {D,C,B,A}
+ palignr xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ pminud xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ palignr xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ pminud xmm2, xmm3 ; xmm2 has min value in low dword
+
+ movd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ pand xmm2, [rel clear_low_nibble]
+ pshufd xmm2, xmm2, 0
+
+ psubd xmm0, xmm2
+ psubd xmm1, xmm2
+
+ movdqa [state + _lens + 0*16], xmm0
+ movdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x4x2_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*32]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
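
The SSE flush is the same algorithm as the AVX version above; the only mechanical difference is that destructive two-operand SSE instructions force the extra "movdqa xmm2, xmm0" copy before pminud. For reference, the same min reduction written with SSE4.1 intrinsics (a sketch, not part of the sources):

    #include <smmintrin.h>

    static unsigned min_encoded_len(__m128i lens_lo, __m128i lens_hi)
    {
            __m128i m = _mm_min_epu32(lens_lo, lens_hi);
            m = _mm_min_epu32(m, _mm_srli_si128(m, 8)); /* fold upper half      */
            m = _mm_min_epu32(m, _mm_srli_si128(m, 4)); /* fold remaining dword */
            return (unsigned)_mm_cvtsi128_si32(m);      /* blocks<<4 | lane     */
    }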
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..f41e5efbc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+
+void md5_mb_mgr_init_avx2(MD5_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes[0] = 0xfedcba9876543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < 16; j++) {
+ state->lens[j] = 0xFFFFFFFF;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
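
The constant 0xfedcba9876543210 above packs the sixteen free lane numbers one nibble each, with the next free lane in the low nibble; the AVX2 flush shown earlier and the submit routine manipulate it exactly like this pair of helpers (illustration only):

    #include <stdint.h>

    static unsigned pop_lane16(uint64_t *unused_lanes)
    {
            unsigned lane = (unsigned)(*unused_lanes & 0xF); /* "and lane, 0xF"       */
            *unused_lanes >>= 4;                             /* "shr unused_lanes, 4" */
            return lane;
    }

    static void push_lane16(uint64_t *unused_lanes, unsigned lane)
    {
            *unused_lanes = (*unused_lanes << 4) | (lane & 0xF); /* "shl ... / or ..., idx" */
    }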
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..5ff02aa76
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c
@@ -0,0 +1,44 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+
+void md5_mb_mgr_init_avx512(MD5_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes[0] = 0x0706050403020100;
+ state->unused_lanes[1] = 0x0f0e0d0c0b0a0908;
+ state->unused_lanes[2] = 0x1716151413121110;
+ state->unused_lanes[3] = 0x1f1e1d1c1b1a1918;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < 32; j++) {
+ state->lens[j] = 0xFFFFFFFF;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c
new file mode 100644
index 000000000..615cd9d76
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+
+void md5_mb_mgr_init_sse(MD5_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes[0] = 0xF76543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < 8; j++) {
+ state->lens[j] = 0xFFFFFFFF;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
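
Here the eight-lane queue gets an extra 0xF sentinel nibble on top of lanes 0..7. Once every lane has been pushed back, the sentinel sits in nibble 8 again, which is what the "bt unused_lanes, 32+3" test in md5_mb_mgr_flush_sse/_avx above relies on. A one-line sketch of that test (illustration only):

    #include <stdbool.h>
    #include <stdint.h>

    static bool all_lanes_free(uint64_t unused_lanes)
    {
            return (unused_lanes >> (32 + 3)) & 1;  /* sentinel back in nibble 8 */
    }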
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..96adcf614
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm
@@ -0,0 +1,228 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+extern md5_mb_x4x2_avx
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_avx
+%define idx r8
+
+%define p r9
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane r10
+
+%define lane_data r11
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global md5_mb_mgr_submit_avx, function
+md5_mb_mgr_submit_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ ; we need to save/restore all GPRs because lower layer clobbers them
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*1], rbp
+ mov [rsp + 8*2], r12
+ mov [rsp + 8*3], r13
+ mov [rsp + 8*4], r14
+ mov [rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*6], rsi
+ mov [rsp + 8*7], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovd [state + _args_digest + 4*lane + 0*32], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*32], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*32], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*32], xmm0, 3
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x4x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*6]
+ mov rdi, [rsp + 8*7]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*1]
+ mov r12, [rsp + 8*2]
+ mov r13, [rsp + 8*3]
+ mov r14, [rsp + 8*4]
+ mov r15, [rsp + 8*5]
+
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
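
The submit path above only runs the hash kernel once every lane is occupied; until then a job is simply parked in a free lane and NULL is returned. A compact C sketch of that control flow (illustration only, hypothetical names):

    #include <stdint.h>

    /* Returns 1 when the manager is full and the asm falls through to
     * start_loop, which runs md5_mb_x4x2_avx for the minimum block count
     * and retires the lane that finished, exactly like the flush path. */
    static int submit_sketch(uint64_t *unused_lanes, uint32_t lens[8],
                             uint32_t job_blocks)
    {
            unsigned lane = (unsigned)(*unused_lanes & 0xF); /* pop a free lane   */
            *unused_lanes >>= 4;
            lens[lane] = (job_blocks << 4) | lane;           /* pack blocks+lane  */

            /* Only the 0xF sentinel left means all eight lanes are occupied
             * ("cmp unused_lanes, 0xF / jne return_null" above). */
            return (*unused_lanes == 0xF);
    }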
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..ed9b0588e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm
@@ -0,0 +1,239 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x8x2_avx2
+
+[bits 64]
+default rel
+section .text
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define lane rsi
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%define lane rdx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx needs to be in a register not clobbered by md5_mb_x8x2_avx2
+%define idx rbp
+
+%define p r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define num_lanes_inuse r9
+
+%define lane_data r10
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8 (the call pushed 8 bytes of return
+; address, so this keeps rsp 16-byte aligned for the vmovdqa saves at rsp + 8*8)
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global md5_mb_mgr_submit_avx2, function
+md5_mb_mgr_submit_avx2:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ ; we need to save/restore all GPRs because lower layer clobbers them
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*1], rbp
+ mov [rsp + 8*2], r12
+ mov [rsp + 8*3], r13
+ mov [rsp + 8*4], r14
+ mov [rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*6], rsi
+ mov [rsp + 8*7], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovd [state + _args_digest + 4*lane + 0*64], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*64], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*64], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*64], xmm0, 3
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 16
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x8x2_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*6]
+ mov rdi, [rsp + 8*7]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*1]
+ mov r12, [rsp + 8*2]
+ mov r13, [rsp + 8*3]
+ mov r14, [rsp + 8*4]
+ mov r15, [rsp + 8*5]
+
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=32
+
+align 32
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
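Once the shortest packed entry is found, its lane-index nibble is cleared (the clear_low_nibble mask), the pure length is broadcast and subtracted from every lane, and because the subtrahend is never larger than any lane's length field, each entry keeps its lane index in the low four bits. A scalar restatement of that update step (hypothetical names; 16 lanes as in the x8x2 AVX2 core):

#include <stdint.h>

#define NUM_LANES 16			/* md5_mb_x8x2_avx2 runs 8x2 = 16 lanes */

/* Subtract the just-scheduled block count from every lane; mirrors the
 * vpand with clear_low_nibble, the vpshufd broadcast and the two vpsubd. */
static void subtract_min_len(uint32_t lens[NUM_LANES], uint32_t packed_min)
{
	uint32_t len_only = packed_min & 0xFFFFFFF0u;	/* drop the lane index */
	for (int i = 0; i < NUM_LANES; i++)
		lens[i] -= len_only;
}

int main(void)
{
	uint32_t lens[NUM_LANES];
	for (uint32_t lane = 0; lane < NUM_LANES; lane++)
		lens[lane] = ((lane + 5) << 4) | lane;	/* lane 0 is the minimum */

	subtract_min_len(lens, lens[0]);
	return lens[0] == 0 ? 0 : 1;	/* lane 0 now has zero blocks left */
}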
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..1bbc2be2c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm
@@ -0,0 +1,283 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern md5_mb_x16x2_avx512
+
+[bits 64]
+default rel
+section .text
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define lane rsi
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%define lane rdx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx needs to be in a register not clobbered by md5_mb_x16x2_avx512
+%define idx rbp
+
+%define p r11
+
+%define unused_lanes ymm7
+
+%define job_rax rax
+%define len rax
+
+%define num_lanes_inuse r9
+
+%define lane_data r10
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+;; Byte shift in MEM addr, reads an extra byte at [addr+32]
+%macro MEM_VPSRLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+ vmovdqu %%TMP_YMM, [%%addr + 1]
+ vmovdqu [%%addr], %%TMP_YMM
+ mov [%%addr + 31], byte 0
+%endmacro
+
+;; Byte shift in MEM addr, reads an extra byte at [addr-1]
+%macro MEM_VPSLLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+ vmovdqu %%TMP_YMM, [%%addr-1]
+ vmovdqu [%%addr], %%TMP_YMM
+ mov [%%addr], byte 0
+%endmacro
+
+align 64
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global md5_mb_mgr_submit_avx512, function
+md5_mb_mgr_submit_avx512:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ ; we need to save/restore all GPRs because lower layer clobbers them
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*1], rbp
+ mov [rsp + 8*2], r12
+ mov [rsp + 8*3], r13
+ mov [rsp + 8*4], r14
+ mov [rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*6], rsi
+ mov [rsp + 8*7], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov lane, [state + _unused_lanes]
+ and lane, 0x3F
+ MEM_VPSRLDDQ (state + _unused_lanes), unused_lanes
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov DWORD(len), [job + _len]
+
+	shl	len, 6	; low 6 bits store idx
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*16*2], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*16*2], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*16*2], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*16*2], xmm0, 3
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 32
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ ; Find min length
+ vmovdqu ymm5, [state + _lens + 2*32]
+ vmovdqu ymm6, [state + _lens + 3*32]
+
+ vpminud ymm4, ymm5, ymm6 ; ymm4 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm4, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm4, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+ vperm2i128 ymm3, ymm4, ymm4, 1 ; ymm3 has halves of ymm4 reversed
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+
+ vpminud ymm2, ymm2, ymm4 ; ymm2 has min value in low dword
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0x3F
+ shr len2, 6
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_6bits]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+ vpsubd ymm5, ymm5, ymm2
+ vpsubd ymm6, ymm6, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+ vmovdqu [state + _lens + 2*32], ymm5
+ vmovdqu [state + _lens + 3*32], ymm6
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x16x2_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov lane, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+
+ shl lane, 8
+ or lane, idx
+ MEM_VPSLLDDQ (state + _unused_lanes), unused_lanes
+ mov [state + _unused_lanes], lane
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16*2]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16*2], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16*2], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16*2], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*6]
+ mov rdi, [rsp + 8*7]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*1]
+ mov r12, [rsp + 8*2]
+ mov r13, [rsp + 8*3]
+ mov r14, [rsp + 8*4]
+ mov r15, [rsp + 8*5]
+
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=32
+
+align 32
+clear_low_6bits:
+ dq 0x00000000FFFFFFC0, 0x0000000000000000
+ dq 0x00000000FFFFFFC0, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_md5_mb_mgr_submit_avx512
+no_md5_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
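With 32 lanes the free-lane list no longer fits in a 64-bit word of 4-bit indices, so this manager keeps _unused_lanes as a byte queue: MEM_VPSRLDDQ pops the front index by sliding the whole field down one byte, and the completion path pushes a freed index back at the front (MEM_VPSLLDDQ plus the shl/or and the 8-byte store). A scalar model of the net effect, assuming a 32-byte field and illustrative helper names:

#include <stdint.h>
#include <string.h>

#define QUEUE_BYTES 32			/* one index byte per lane */

/* Net effect of MEM_VPSRLDDQ on the submit path: pop the front lane index. */
static uint8_t lane_queue_pop(uint8_t q[QUEUE_BYTES])
{
	uint8_t lane = q[0];
	memmove(q, q + 1, QUEUE_BYTES - 1);	/* the asm uses one unaligned ymm move */
	q[QUEUE_BYTES - 1] = 0;
	return lane;
}

/* Net effect of MEM_VPSLLDDQ + shl/or/store: push a freed index at the front. */
static void lane_queue_push(uint8_t q[QUEUE_BYTES], uint8_t lane)
{
	memmove(q + 1, q, QUEUE_BYTES - 1);
	q[0] = lane;
}

int main(void)
{
	uint8_t q[QUEUE_BYTES];
	for (uint8_t i = 0; i < QUEUE_BYTES; i++)
		q[i] = i;			/* all 32 lanes start free */
	uint8_t lane = lane_queue_pop(q);	/* claim lane 0 */
	lane_queue_push(q, lane);		/* hand it back */
	return q[0];				/* 0 again */
}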
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..2a374c7e3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm
@@ -0,0 +1,229 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x4x2_sse
+
+[bits 64]
+default rel
+section .text
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_sse
+%define idx r8
+
+%define p r9
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane r10
+
+%define lane_data r11
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global md5_mb_mgr_submit_sse, function
+md5_mb_mgr_submit_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ ; we need to save/restore all GPRs because lower layer clobbers them
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*1], rbp
+ mov [rsp + 8*2], r12
+ mov [rsp + 8*3], r13
+ mov [rsp + 8*4], r14
+ mov [rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*6], rsi
+ mov [rsp + 8*7], rdi
+ movdqa [rsp + 8*8 + 16*0], xmm6
+ movdqa [rsp + 8*8 + 16*1], xmm7
+ movdqa [rsp + 8*8 + 16*2], xmm8
+ movdqa [rsp + 8*8 + 16*3], xmm9
+ movdqa [rsp + 8*8 + 16*4], xmm10
+ movdqa [rsp + 8*8 + 16*5], xmm11
+ movdqa [rsp + 8*8 + 16*6], xmm12
+ movdqa [rsp + 8*8 + 16*7], xmm13
+ movdqa [rsp + 8*8 + 16*8], xmm14
+ movdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqu xmm0, [job + _result_digest + 0*16]
+ movd [state + _args_digest + 4*lane + 0*32], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*32], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*32], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*32], xmm0, 3
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ movdqa xmm0, [state + _lens + 0*16]
+ movdqa xmm1, [state + _lens + 1*16]
+
+ movdqa xmm2, xmm0
+ pminud xmm2, xmm1 ; xmm2 has {D,C,B,A}
+ palignr xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ pminud xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ palignr xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ pminud xmm2, xmm3 ; xmm2 has min value in low dword
+
+ movd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ pand xmm2, [rel clear_low_nibble]
+ pshufd xmm2, xmm2, 0
+
+ psubd xmm0, xmm2
+ psubd xmm1, xmm2
+
+ movdqa [state + _lens + 0*16], xmm0
+ movdqa [state + _lens + 1*16], xmm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x4x2_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*32]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 8*8 + 16*0]
+ movdqa xmm7, [rsp + 8*8 + 16*1]
+ movdqa xmm8, [rsp + 8*8 + 16*2]
+ movdqa xmm9, [rsp + 8*8 + 16*3]
+ movdqa xmm10, [rsp + 8*8 + 16*4]
+ movdqa xmm11, [rsp + 8*8 + 16*5]
+ movdqa xmm12, [rsp + 8*8 + 16*6]
+ movdqa xmm13, [rsp + 8*8 + 16*7]
+ movdqa xmm14, [rsp + 8*8 + 16*8]
+ movdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*6]
+ mov rdi, [rsp + 8*7]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*1]
+ mov r12, [rsp + 8*2]
+ mov r13, [rsp + 8*3]
+ mov r14, [rsp + 8*4]
+ mov r15, [rsp + 8*5]
+
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
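All four submit variants (SSE, AVX, AVX2, AVX-512) follow the same control flow: claim a lane from unused_lanes, record the job pointer, buffer pointer and packed length, and return NULL until every lane is occupied; only a full manager enters the min-length loop and calls the wide core. A compact C sketch of that gate, with hypothetical stand-ins for the real MB_MGR layout:

#include <stddef.h>
#include <stdint.h>

#define LANES 8				/* 8 for SSE/AVX, 16 for AVX2, 32 for AVX-512 */

struct md5_job { const uint8_t *buffer; uint32_t len; /* length in blocks */ };

struct mb_mgr {
	struct md5_job *lane_job[LANES];
	uint32_t lens[LANES];		/* (blocks << 4) | lane */
	uint64_t unused_lanes;		/* packed 4-bit free-lane indices */
	uint32_t num_lanes_inuse;
};

/* Returns NULL while lanes are still filling; the real code then finds the
 * minimum length, runs the multi-lane core and returns the finished job. */
static struct md5_job *submit_sketch(struct mb_mgr *m, struct md5_job *job)
{
	uint32_t lane = m->unused_lanes & 0xF;
	m->unused_lanes >>= 4;
	m->lane_job[lane] = job;
	m->lens[lane] = (job->len << 4) | lane;
	m->num_lanes_inuse++;

	if (m->num_lanes_inuse < LANES)
		return NULL;		/* keep buffering jobs */

	/* ...min-length search, core call and completion handling elided... */
	return job;			/* placeholder for the job that finished */
}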
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c
new file mode 100644
index 000000000..bba868f1a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c
@@ -0,0 +1,159 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/md5.h>
+#include "md5_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * MD5_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ int ret;
+
+ printf("multibinary_md5 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ md5_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ MD5(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_md5 test
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ md5_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ MD5(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_md5 test
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_md5_ssl rand: Pass\n");
+
+ return fail;
+}
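The digest comparison above hinges on byte order: OpenSSL's MD5() writes the digest as 16 bytes, while the multi-buffer job keeps the four 32-bit MD5 state words, and those words correspond to the bytes read little-endian, which is what to_le32() on a uint32_t view of digest_ssl achieves. A byte-explicit equivalent (hypothetical helper name) would be:

#include <stdint.h>

/* Word j (0..3) of an MD5 digest given as 16 raw bytes, read little-endian. */
static uint32_t md5_digest_word(const uint8_t d[16], int j)
{
	return (uint32_t)d[4 * j] |
	       ((uint32_t)d[4 * j + 1] << 8) |
	       ((uint32_t)d[4 * j + 2] << 16) |
	       ((uint32_t)d[4 * j + 3] << 24);
}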
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c
new file mode 100644
index 000000000..d19246138
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c
@@ -0,0 +1,202 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "md5_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][MD5_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+ int ret;
+
+ printf("multibinary_md5 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ md5_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ md5_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+ // Run sb_md5 test
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ md5_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ md5_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run md5_mb test
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%08X <=> 0x%08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ md5_ctx_mgr_init(mgr);
+
+ // Extend to the end of allocated buffer to construct jobs
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ md5_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // sb_md5 test
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%08X"
+ ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_md5 rand: Pass\n");
+
+ return fail;
+}
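The "test at the end of buffer" pass above lays the jobs out so that each one finishes exactly at the last byte of a small allocation; any read past a job's stated length then lands outside the buffer, where tools such as valgrind or ASan would catch it. The layout reduces to this restatement of the loop above (no new functionality):

#include <stdint.h>

/* Job i starts at offset i of a 'jobs'-byte allocation and runs to its end,
 * so bufs[i] + lens[i] == tmp_buf + jobs for every i. */
static void layout_tail_jobs(uint8_t *tmp_buf, uint32_t jobs,
			     uint8_t *bufs[], uint32_t lens[])
{
	for (uint32_t i = 0; i < jobs; i++) {
		bufs[i] = &tmp_buf[i];
		lens[i] = jobs - i;
	}
}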
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c
new file mode 100644
index 000000000..2eab61dfa
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c
@@ -0,0 +1,297 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "md5_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*MD5_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*MD5_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint32_t digest_ref[TEST_BUFS][MD5_DIGEST_NWORDS];
+
+extern void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+ int ret;
+
+ printf("multibinary_md5_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ md5_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ md5_ref(bufs[i], digest_ref[i], TEST_LEN);
+ }
+
+ // Run sb_md5 tests
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ // Add jobs while available or finished
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = md5_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = md5_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = md5_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d fail %8X <=> %8X",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ md5_ref(bufs[i], digest_ref[i], lens[i]);
+ }
+
+ md5_ctx_mgr_init(mgr);
+
+ // Run md5_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = MD5_BLOCK_SIZE +
+ MD5_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % MD5_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem, HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = md5_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = md5_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % MD5_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = md5_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_md5_update rand: Pass\n");
+
+ return fail;
+}
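The resubmit logic exercised above boils down to a simple streaming pattern: open with HASH_FIRST, keep feeding whichever context the manager hands back with HASH_UPDATE, finish the tail with HASH_LAST, and flush whenever nothing is returned. A minimal single-stream sketch using the same API as the test (it assumes CHUNK is a multiple of MD5_BLOCK_SIZE, as HASH_FIRST/HASH_UPDATE require):

#include "md5_mb.h"

#define CHUNK (8 * MD5_BLOCK_SIZE)

static void hash_one_stream(MD5_HASH_CTX_MGR *mgr, MD5_HASH_CTX *ctx,
			    unsigned char *buf, uint32_t len)
{
	hash_ctx_init(ctx);
	MD5_HASH_CTX *r = md5_ctx_mgr_submit(mgr, ctx, buf,
					     len <= CHUNK ? len : CHUNK,
					     len <= CHUNK ? HASH_ENTIRE : HASH_FIRST);

	while (!(r && hash_ctx_complete(r))) {
		if (r == NULL) {		/* nothing returned yet: force progress */
			r = md5_ctx_mgr_flush(mgr);
			continue;
		}
		/* r came back unfinished: feed it the next chunk. */
		uint32_t done = (uint32_t)r->total_length;
		uint32_t rem = len - done;
		r = md5_ctx_mgr_submit(mgr, r, buf + done,
				       rem <= CHUNK ? rem : CHUNK,
				       rem <= CHUNK ? HASH_LAST : HASH_UPDATE);
	}
}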
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c
new file mode 100644
index 000000000..4f84b6723
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c
@@ -0,0 +1,229 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "md5_mb.h"
+
+typedef uint32_t DigestMD5[MD5_DIGEST_NWORDS];
+
+#define MSGS 13
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
+
+static uint8_t msg1[] = "Test vector from febooti.com";
+static uint8_t msg2[] = "12345678901234567890" "12345678901234567890"
+ "12345678901234567890" "12345678901234567890";
+static uint8_t msg3[] = "";
+static uint8_t msg4[] = "abcdefghijklmnopqrstuvwxyz";
+static uint8_t msg5[] = "message digest";
+static uint8_t msg6[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz0123456789";
+static uint8_t msg7[] = "abc";
+static uint8_t msg8[] = "a";
+
+static uint8_t msg9[] = "";
+static uint8_t msgA[] = "abcdefghijklmnopqrstuvwxyz";
+static uint8_t msgB[] = "message digest";
+static uint8_t msgC[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz0123456789";
+static uint8_t msgD[] = "abc";
+
+static DigestMD5 expResultDigest1 = { 0x61b60a50, 0xfbb76d3c, 0xf5620cd3, 0x0f3d57ff };
+static DigestMD5 expResultDigest2 = { 0xa2f4ed57, 0x55c9e32b, 0x2eda49ac, 0x7ab60721 };
+static DigestMD5 expResultDigest3 = { 0xd98c1dd4, 0x04b2008f, 0x980980e9, 0x7e42f8ec };
+static DigestMD5 expResultDigest4 = { 0xd7d3fcc3, 0x00e49261, 0x6c49fb7d, 0x3be167ca };
+static DigestMD5 expResultDigest5 = { 0x7d696bf9, 0x8d93b77c, 0x312f5a52, 0xd061f1aa };
+static DigestMD5 expResultDigest6 = { 0x98ab74d1, 0xf5d977d2, 0x2c1c61a5, 0x9f9d419f };
+static DigestMD5 expResultDigest7 = { 0x98500190, 0xb04fd23c, 0x7d3f96d6, 0x727fe128 };
+static DigestMD5 expResultDigest8 = { 0xb975c10c, 0xa8b6f1c0, 0xe299c331, 0x61267769 };
+
+static DigestMD5 expResultDigest9 = { 0xd98c1dd4, 0x04b2008f, 0x980980e9, 0x7e42f8ec };
+static DigestMD5 expResultDigestA = { 0xd7d3fcc3, 0x00e49261, 0x6c49fb7d, 0x3be167ca };
+static DigestMD5 expResultDigestB = { 0x7d696bf9, 0x8d93b77c, 0x312f5a52, 0xd061f1aa };
+static DigestMD5 expResultDigestC = { 0x98ab74d1, 0xf5d977d2, 0x2c1c61a5, 0x9f9d419f };
+static DigestMD5 expResultDigestD = { 0x98500190, 0xb04fd23c, 0x7d3f96d6, 0x727fe128 };
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7, msg8, msg9,
+ msgA, msgB, msgC, msgD
+};
+
+static uint32_t *expResultDigest[MSGS] = {
+ expResultDigest1, expResultDigest2, expResultDigest3,
+ expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7, expResultDigest8, expResultDigest9,
+ expResultDigestA, expResultDigestB, expResultDigestC,
+ expResultDigestD
+};
+
+int main(void)
+{
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ md5_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i], msgs[i],
+ strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
+ while (1) {
+ ctx = md5_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = md5_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_md5 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..0ba50a1d2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c
@@ -0,0 +1,129 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/md5.h>
+#include "md5_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * MD5_DIGEST_NWORDS];
+
+int main(void)
+{
+ int ret;
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ md5_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ MD5(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("md5_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_md5" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer md5 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_md5_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm
new file mode 100644
index 000000000..718572638
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm
@@ -0,0 +1,853 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+
+;; code to compute double 16-lane MD5 using AVX512
+
+;; Stack must be aligned to 64 bytes before call
+
+;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp
+;;
+;; clobbers zmm0-8, 14-31
+
+;; clobbers all GPRs other than arg1 and rbp
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2
+ %define reg4 r9 ; arg3
+ %define var1 rdi
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var1 rdx ; arg2
+ %define var2 rcx ; arg3
+ %define local_func_decl(func_name) mk_global func_name, function, internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+;; These are pointers to data block1 and block2 in the stack
+; which will ping pong back and forth
+%define DPTR1 rbx
+%define DPTR2 var2
+%define IDX var1
+%define TBL rax
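+;; DPTR1 points at the block currently being hashed; DPTR2 receives the next
+;; block as it is transposed, and the two pointers are exchanged at the end of
+;; each pass through .lloop.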
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+%define inp4 r12
+%define inp5 r13
+%define inp6 r14
+%define inp7 r15
+
+;; Transposed Digest Storage
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define A1 zmm4
+%define B1 zmm5
+%define C1 zmm6
+%define D1 zmm7
+
+%define md5c zmm16
+
+%define MASK0 zmm17
+%define MASK1 zmm18
+
+%define TMP0 zmm20
+%define TMP1 zmm21
+
+
+;; Data are stored into the Wx after transposition
+%define W0 zmm8
+%define W1 zmm9
+%define W2 zmm10
+%define W3 zmm11
+%define W4 zmm12
+%define W5 zmm13
+%define W6 zmm14
+%define W7 zmm15
+
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+%define MD5_DIGEST_ROW_SIZE (16*4)
+%define APPEND(a,b) a %+ b
+%define APPEND3(a,b,c) a %+ b %+ c
+
+;; Temporary registers used during data transposition
+
+%define RESZ resb 64*
+;; Assume stack aligned to 64 bytes before call
+;; Therefore FRAMESIZE mod 64 must be 64-8 = 56
+struc STACK
+_DATA: RESZ 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
+_DIGEST: RESZ 8 ; stores Z_AA-Z_DD, Z_AA2-Z_DD2
+_TMPDIGEST: RESZ 2 ; stores Z_AA, Z_BB temporarily
+_RSP_SAVE: RESQ 1 ; original RSP
+endstruc
+
+%define Z_AA rsp + _DIGEST + 64*0
+%define Z_BB rsp + _DIGEST + 64*1
+%define Z_CC rsp + _DIGEST + 64*2
+%define Z_DD rsp + _DIGEST + 64*3
+%define Z_AA1 rsp + _DIGEST + 64*4
+%define Z_BB1 rsp + _DIGEST + 64*5
+%define Z_CC1 rsp + _DIGEST + 64*6
+%define Z_DD1 rsp + _DIGEST + 64*7
+
+%define MD5_DIGEST_ROW_SIZE (32*4)
+
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+%define rot11 7
+%define rot12 12
+%define rot13 17
+%define rot14 22
+%define rot21 5
+%define rot22 9
+%define rot23 14
+%define rot24 20
+%define rot31 4
+%define rot32 11
+%define rot33 16
+%define rot34 23
+%define rot41 6
+%define rot42 10
+%define rot43 15
+%define rot44 21
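+;; rotXY is the rotation amount used in round X at position Y of each group of
+;; four steps (the s constants from RFC 1321).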
+
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+
+; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
+; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
+; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
+; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
+; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
+; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
+; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
+; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
+; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}
+
+; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
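+; i.e. a 16x16 dword transpose: on output, register rj holds dword j from each
+; of the sixteen lanes a..p.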
+
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; use r6 in place of t0
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
+
+	vshufps	%%r11, %%r6, %%t1, 0xDD	; r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
+	vshufps	%%r9, %%r8, %%r10, 0x88	; r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
+	vshufps	%%r8, %%r8, %%r10, 0xDD	; r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
+	vshufps	%%r6, %%r6, %%t1, 0x88	; r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
+
+ ; use r10 in place of t0
+	vshufps	%%r10, %%r12, %%r13, 0x44	; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0}
+	vshufps	%%r12, %%r12, %%r13, 0xEE	; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2}
+	vshufps	%%t1, %%r14, %%r15, 0x44	; t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0}
+	vshufps	%%r14, %%r14, %%r15, 0xEE	; r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2}
+
+	vshufps	%%r15, %%r10, %%t1, 0xDD	; r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1}
+	vshufps	%%r13, %%r12, %%r14, 0x88	; r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2}
+	vshufps	%%r12, %%r12, %%r14, 0xDD	; r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3}
+	vshufps	%%r10, %%r10, %%t1, 0x88	; r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0}
+
+;; At this point, the registers that contain interesting data are:
+;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
+;; Can use t1 and r14 as scratch registers
+
+ vmovdqa32 %%r14, MASK0
+ vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqa32 %%t1, MASK1
+ vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqa32 %%r2, MASK0
+ vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqa32 %%t0, MASK1
+ vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqa32 %%r3, MASK0
+ vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r7, MASK1
+ vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqa32 %%r1, MASK0
+ vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqa32 %%r5, MASK1
+ vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r0, MASK0
+ vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
+ vmovdqa32 %%r4, MASK1
+ vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
+
+ vmovdqa32 %%r6, MASK0
+ vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
+ vmovdqa32 %%r10, MASK1
+ vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
+
+ vmovdqa32 %%r11, MASK0
+ vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
+ vmovdqa32 %%r15, MASK1
+ vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
+
+ vmovdqa32 %%r9, MASK0
+ vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
+ vmovdqa32 %%r13, MASK1
+ vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
+
+;; At this point r8 and r12 can be used as scratch registers
+
+ vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+
+ vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+ vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+ vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+ vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+ vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+ vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+
+ vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+%endmacro
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_ARGS1 0
+%xdefine TMP_ D1
+%xdefine D1 C1
+%xdefine C1 B1
+%xdefine B1 A1
+%xdefine A1 TMP_
+%endm
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +Ft(B,C,D) +data +const), nrot)
+;;eg: PROCESS_LOOP MD5constx, Mdatax, F_IMMEDx, NROTx
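+;; Each invocation performs this step for 32 lanes at once: 16 lanes in A..D
+;; and 16 lanes in A1..D1.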
+%macro PROCESS_LOOP 6
+%define %%MD5const %1
+%define %%data %2
+%define %%F_IMMED %3
+%define %%NROT %4
+%define %%TMP_PR0 %5
+%define %%TMP_PR1 %6
+ ; a=b+((a+Ft(b,c,d)+Mj+ti)<<s)
+
+ ; Ft
+ ; 0-15 Ft:F(X,Y,Z)=(X&Y)|((~X)&Z) 0xca
+ ; 16-31 Ft:G(X,Y,Z)=(X&Z)|(Y&(~Z)) 0xe4
+ ; 32-47 Ft:H(X,Y,Z)=X^Y^Z 0x96
+ ; 48-63 Ft:I(X,Y,Z)=Y^(X|(~Z)) 0x39
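+	; vpternlogd evaluates the 3-input boolean function whose truth table is
+	; the immediate byte, so each Ft costs one instruction per set of 16 lanes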
+
+ vpaddd A, A, %%MD5const
+ vpaddd A1, A1, %%MD5const
+ vpaddd A, A, [%%data]
+ vpaddd A1, A1, [%%data + 16*64]
+ vmovdqa32 %%TMP_PR0, B ; Copy B
+	vmovdqa32 %%TMP_PR1, B1		; Copy B1
+ vpternlogd %%TMP_PR0, C, D, %%F_IMMED
+ vpternlogd %%TMP_PR1, C1, D1, %%F_IMMED
+ vpaddd A, A, %%TMP_PR0
+ vpaddd A1, A1, %%TMP_PR1
+ vprold A, A, %%NROT
+ vprold A1, A1, %%NROT
+ vpaddd A, A, B
+ vpaddd A1, A1, B1
+
+ ROTATE_ARGS
+ ROTATE_ARGS1
+%endmacro
+
+align 64
+
+; void md5_mb_x16x2_avx512(MD5_ARGS *args, UINT64 num_blks)
+; arg 1 : pointer to MD5_ARGS structure
+; arg 2 : number of blocks (>=1)
+
+local_func_decl(md5_mb_x16x2_avx512)
+md5_mb_x16x2_avx512:
+ endbranch
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -64
+ mov [rsp + _RSP_SAVE], rax
+
+ mov DPTR1, rsp
+ lea DPTR2, [rsp + 64*32]
+
+ ;; Load MD5 constant pointer to register
+ lea TBL, [MD5_TABLE]
+ vmovdqa32 MASK0, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vmovdqa32 MASK1, [PSHUFFLE_TRANSPOSE16_MASK2]
+
+	;; Preload input data from all 32 segments (two sets of 16 lanes).
+ xor IDX, IDX
+
+ ;; transpose input onto stack
+ ;; first 16 lanes read
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+ ;; first 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR1+_DATA+(0)*64],W0
+ vmovdqa32 [DPTR1+_DATA+(1)*64],W1
+ vmovdqa32 [DPTR1+_DATA+(2)*64],W2
+ vmovdqa32 [DPTR1+_DATA+(3)*64],W3
+ vmovdqa32 [DPTR1+_DATA+(4)*64],W4
+ vmovdqa32 [DPTR1+_DATA+(5)*64],W5
+ vmovdqa32 [DPTR1+_DATA+(6)*64],W6
+ vmovdqa32 [DPTR1+_DATA+(7)*64],W7
+ vmovdqa32 [DPTR1+_DATA+(8)*64],W8
+ vmovdqa32 [DPTR1+_DATA+(9)*64],W9
+ vmovdqa32 [DPTR1+_DATA+(10)*64],W10
+ vmovdqa32 [DPTR1+_DATA+(11)*64],W11
+ vmovdqa32 [DPTR1+_DATA+(12)*64],W12
+ vmovdqa32 [DPTR1+_DATA+(13)*64],W13
+ vmovdqa32 [DPTR1+_DATA+(14)*64],W14
+ vmovdqa32 [DPTR1+_DATA+(15)*64],W15
+
+ ;; second 16 lanes read
+ mov inp0, [IN + 16*8]
+ mov inp1, [IN + 17*8]
+ mov inp2, [IN + 18*8]
+ mov inp3, [IN + 19*8]
+ mov inp4, [IN + 20*8]
+ mov inp5, [IN + 21*8]
+ mov inp6, [IN + 22*8]
+ mov inp7, [IN + 23*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 24*8]
+ mov inp1, [IN + 25*8]
+ mov inp2, [IN + 26*8]
+ mov inp3, [IN + 27*8]
+ mov inp4, [IN + 28*8]
+ mov inp5, [IN + 29*8]
+ mov inp6, [IN + 30*8]
+ mov inp7, [IN + 31*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+ ;; second 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR1+_DATA+(16+0)*64],W0
+ vmovdqa32 [DPTR1+_DATA+(16+1)*64],W1
+ vmovdqa32 [DPTR1+_DATA+(16+2)*64],W2
+ vmovdqa32 [DPTR1+_DATA+(16+3)*64],W3
+ vmovdqa32 [DPTR1+_DATA+(16+4)*64],W4
+ vmovdqa32 [DPTR1+_DATA+(16+5)*64],W5
+ vmovdqa32 [DPTR1+_DATA+(16+6)*64],W6
+ vmovdqa32 [DPTR1+_DATA+(16+7)*64],W7
+ vmovdqa32 [DPTR1+_DATA+(16+8)*64],W8
+ vmovdqa32 [DPTR1+_DATA+(16+9)*64],W9
+ vmovdqa32 [DPTR1+_DATA+(16+10)*64],W10
+ vmovdqa32 [DPTR1+_DATA+(16+11)*64],W11
+ vmovdqa32 [DPTR1+_DATA+(16+12)*64],W12
+ vmovdqa32 [DPTR1+_DATA+(16+13)*64],W13
+ vmovdqa32 [DPTR1+_DATA+(16+14)*64],W14
+ vmovdqa32 [DPTR1+_DATA+(16+15)*64],W15
+
+ ;; Initialize digests
+	;; vmovdqu32 is used in place of vmovdqa32
+ vmovdqu32 A, [DIGEST + 0 * MD5_DIGEST_ROW_SIZE]
+ vmovdqu32 B, [DIGEST + 1 * MD5_DIGEST_ROW_SIZE]
+ vmovdqu32 C, [DIGEST + 2 * MD5_DIGEST_ROW_SIZE]
+ vmovdqu32 D, [DIGEST + 3 * MD5_DIGEST_ROW_SIZE]
+	; Load the digest for the second set of 16 streams (17-32)
+ vmovdqu32 A1,[DIGEST + 0 * MD5_DIGEST_ROW_SIZE + 64]
+ vmovdqu32 B1,[DIGEST + 1 * MD5_DIGEST_ROW_SIZE + 64]
+ vmovdqu32 C1,[DIGEST + 2 * MD5_DIGEST_ROW_SIZE + 64]
+ vmovdqu32 D1,[DIGEST + 3 * MD5_DIGEST_ROW_SIZE + 64]
+
+.lloop:
+ ;; Increment IDX to point to next data block (64 bytes per block)
+ add IDX, 64
+
+ ; Save digests for later addition
+ vmovdqa32 [Z_AA], A
+ vmovdqa32 [Z_BB], B
+ vmovdqa32 [Z_CC], C
+ vmovdqa32 [Z_DD], D
+ vmovdqa32 [Z_AA1], A1
+ vmovdqa32 [Z_BB1], B1
+ vmovdqa32 [Z_CC1], C1
+ vmovdqa32 [Z_DD1], D1
+
+ sub SIZE, 1
+ je .LastLoop
+
+%assign I 0
+%assign I_fimm 0xCA
+%rep 16 ; 0<=I<=15
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data I
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
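+	;; While rounds 2-4 of the current block run below, the next block is read
+	;; and transposed into the DPTR2 buffer so the loads overlap with the
+	;; computation.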
+ ;; first 16 lanes read
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+
+%assign I 16
+%assign I_fimm 0xE4
+%rep 16 ; 16<=I<=31
+ %assign I_data ((5*I+1) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ;; first 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR2+_DATA+(0)*64],W0
+ vmovdqa32 [DPTR2+_DATA+(1)*64],W1
+ vmovdqa32 [DPTR2+_DATA+(2)*64],W2
+ vmovdqa32 [DPTR2+_DATA+(3)*64],W3
+ vmovdqa32 [DPTR2+_DATA+(4)*64],W4
+ vmovdqa32 [DPTR2+_DATA+(5)*64],W5
+ vmovdqa32 [DPTR2+_DATA+(6)*64],W6
+ vmovdqa32 [DPTR2+_DATA+(7)*64],W7
+ vmovdqa32 [DPTR2+_DATA+(8)*64],W8
+ vmovdqa32 [DPTR2+_DATA+(9)*64],W9
+ vmovdqa32 [DPTR2+_DATA+(10)*64],W10
+ vmovdqa32 [DPTR2+_DATA+(11)*64],W11
+ vmovdqa32 [DPTR2+_DATA+(12)*64],W12
+ vmovdqa32 [DPTR2+_DATA+(13)*64],W13
+ vmovdqa32 [DPTR2+_DATA+(14)*64],W14
+ vmovdqa32 [DPTR2+_DATA+(15)*64],W15
+
+%assign I 32
+%assign I_fimm 0x96
+%rep 16 ; 32<=I<=47
+ %assign I_data ((3*I+5) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ;; second 16 lanes read
+ mov inp0, [IN + 16*8]
+ mov inp1, [IN + 17*8]
+ mov inp2, [IN + 18*8]
+ mov inp3, [IN + 19*8]
+ mov inp4, [IN + 20*8]
+ mov inp5, [IN + 21*8]
+ mov inp6, [IN + 22*8]
+ mov inp7, [IN + 23*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 24*8]
+ mov inp1, [IN + 25*8]
+ mov inp2, [IN + 26*8]
+ mov inp3, [IN + 27*8]
+ mov inp4, [IN + 28*8]
+ mov inp5, [IN + 29*8]
+ mov inp6, [IN + 30*8]
+ mov inp7, [IN + 31*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+
+%assign I 48
+%assign I_fimm 0x39
+%rep 16 ; 48<=I<=63
+ %assign I_rotX (I/16+1)
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data ((7*I) % 16)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ;; second 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR2+_DATA+(16+0)*64],W0
+ vmovdqa32 [DPTR2+_DATA+(16+1)*64],W1
+ vmovdqa32 [DPTR2+_DATA+(16+2)*64],W2
+ vmovdqa32 [DPTR2+_DATA+(16+3)*64],W3
+ vmovdqa32 [DPTR2+_DATA+(16+4)*64],W4
+ vmovdqa32 [DPTR2+_DATA+(16+5)*64],W5
+ vmovdqa32 [DPTR2+_DATA+(16+6)*64],W6
+ vmovdqa32 [DPTR2+_DATA+(16+7)*64],W7
+ vmovdqa32 [DPTR2+_DATA+(16+8)*64],W8
+ vmovdqa32 [DPTR2+_DATA+(16+9)*64],W9
+ vmovdqa32 [DPTR2+_DATA+(16+10)*64],W10
+ vmovdqa32 [DPTR2+_DATA+(16+11)*64],W11
+ vmovdqa32 [DPTR2+_DATA+(16+12)*64],W12
+ vmovdqa32 [DPTR2+_DATA+(16+13)*64],W13
+ vmovdqa32 [DPTR2+_DATA+(16+14)*64],W14
+ vmovdqa32 [DPTR2+_DATA+(16+15)*64],W15
+
+ ; Add old digest
+ vpaddd A,A,[Z_AA]
+ vpaddd B,B,[Z_BB]
+ vpaddd C,C,[Z_CC]
+ vpaddd D,D,[Z_DD]
+ vpaddd A1,A1,[Z_AA1]
+ vpaddd B1,B1,[Z_BB1]
+ vpaddd C1,C1,[Z_CC1]
+ vpaddd D1,D1,[Z_DD1]
+
+ ; Swap DPTR1 and DPTR2
+ xchg DPTR1, DPTR2
+ ;; Proceed to processing of next block
+ jmp .lloop
+
+.LastLoop:
+%assign I 0
+%assign I_fimm 0xCA
+%rep 16 ; 0<=I<=15
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data I
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+%assign I 16
+%assign I_fimm 0xE4
+%rep 16 ; 16<=I<=31
+ %assign I_data ((5*I+1) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+%assign I 32
+%assign I_fimm 0x96
+%rep 16 ; 32<=I<=47
+ %assign I_data ((3*I+5) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+%assign I 48
+%assign I_fimm 0x39
+%rep 16 ; 48<=I<=63
+ %assign I_rotX (I/16+1)
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data ((7*I) % 16)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,[Z_AA]
+ vpaddd B,B,[Z_BB]
+ vpaddd C,C,[Z_CC]
+ vpaddd D,D,[Z_DD]
+ vpaddd A1,A1,[Z_AA1]
+ vpaddd B1,B1,[Z_BB1]
+ vpaddd C1,C1,[Z_CC1]
+ vpaddd D1,D1,[Z_DD1]
+
+	;; advance the 32 lane data pointers past the processed data
+%assign I 0
+%rep 16
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ vmovdqu32 [DIGEST + 0*MD5_DIGEST_ROW_SIZE ], A
+ vmovdqu32 [DIGEST + 1*MD5_DIGEST_ROW_SIZE ], B
+ vmovdqu32 [DIGEST + 2*MD5_DIGEST_ROW_SIZE ], C
+ vmovdqu32 [DIGEST + 3*MD5_DIGEST_ROW_SIZE ], D
+	; Store the digest for the second set of 16 streams (17-32)
+ vmovdqu32 [DIGEST + 0 * MD5_DIGEST_ROW_SIZE + 64], A1
+ vmovdqu32 [DIGEST + 1 * MD5_DIGEST_ROW_SIZE + 64], B1
+ vmovdqu32 [DIGEST + 2 * MD5_DIGEST_ROW_SIZE + 64], C1
+ vmovdqu32 [DIGEST + 3 * MD5_DIGEST_ROW_SIZE + 64], D1
+
+ mov rsp, [rsp + _RSP_SAVE]
+ ret
+
+section .data
+align 64
+MD5_TABLE:
+ dd 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee
+ dd 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501
+ dd 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
+ dd 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821
+ dd 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
+ dd 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
+ dd 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
+ dd 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a
+ dd 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
+ dd 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
+ dd 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
+ dd 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665
+ dd 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
+ dd 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
+ dd 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
+ dd 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
+
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_md5_mb_x16x2_avx512
+no_md5_mb_x16x2_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm
new file mode 100644
index 000000000..afca137bd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm
@@ -0,0 +1,783 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+; clobbers all XMM registers
+; clobbers all GPRs except arg1 and r8
+
+;; code to compute octal MD5 using AVX
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ ;movdqa %%F,%%Z
+ vpxor %%F,%%Z, %%Y
+ vpand %%F,%%F,%%X
+ vpxor %%F,%%F,%%Z
+%endmacro
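+; MAGIC_F computes the same value as the RFC 1321 form (X & Y) | (~X & Z),
+; rewritten to avoid a separate NOT.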
+
+; macro MAGIC_G F,X,Y,Z ;; G(X,Y,Z) = MAGIC_F(Z,X,Y) = ((X) & (Z)) | ((Y) & ~(Z))
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ MAGIC_F %%F,%%Z,%%X,%%Y
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ ;movdqa %%F,%%Z
+ vpxor %%F,%%Z, %%Y
+ vpxor %%F,%%F, %%X
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ vpcmpeqd %%F,%%F,%%F ; 0xFFFF
+ vpxor %%F,%%F,%%Z ; pnot %%Z
+ vpor %%F,%%F,%%X
+ vpxor %%F,%%F,%%Y
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ ;movdqa %%tmp, %%reg
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, data, MD5const, nrot
+%macro MD5_STEP1 14
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%data %12
+%define %%MD5const %13
+%define %%nrot %14
+
+ vpaddd %%A, %%A, %%MD5const
+ vpaddd %%A2, %%A2, %%MD5const
+ vpaddd %%A, %%A, [%%data]
+ vpaddd %%A2, %%A2, [%%data + 16*16]
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ vpaddd %%A, %%A, %%FUN
+ %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2
+ vpaddd %%A2, %%A2, %%FUN
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP
+ vpaddd %%A, %%A, %%B
+ vpaddd %%A2, %%A2, %%B2
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+ vmovdqa %%TMP,[%%data]
+ vmovdqa %%TMP2,[%%data + 16*16]
+ vpaddd %%A, %%A, %%MD5const
+ vpaddd %%A2, %%A2, %%MD5const
+ vpaddd %%A, %%A, %%TMP
+ vpaddd %%A2, %%A2, %%TMP2
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2
+ vpaddd %%A, %%A, %%FUN
+ vpaddd %%A2, %%A2, %%FUN2
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP2
+ vpaddd %%A, %%A, %%B
+ vpaddd %%A2, %%A2, %%B2
+%endmacro
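+;; MD5_STEP differs from MD5_STEP1 in that it preloads the message words into
+;; TMP/TMP2 and gives the second lane set its own scratch registers
+;; (FUN2/TMP2); it is only used on the last block, where those registers
+;; (aliased to T0/T1) are not needed for transposing a following block.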
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4 ; tmp
+%define F xmm5 ; tmp
+
+%define A2 xmm6
+%define B2 xmm7
+%define C2 xmm8
+%define D2 xmm9
+
+
+%define FUN E
+%define TMP F
+%define FUN2 xmm10
+%define TMP2 xmm11
+
+%define T0 xmm10
+%define T1 xmm11
+%define T2 xmm12
+%define T3 xmm13
+%define T4 xmm14
+%define T5 xmm15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;; Linux Registers
+%define arg1 rdi
+%define arg2 rsi
+%define inp7 rcx
+%define mem1 rdx
+%else
+;; Windows Registers
+%define arg1 rcx
+%define arg2 rdx
+%define inp7 rdi
+%define mem1 rsi
+%endif
+; r8 is not used
+
+; Common definitions
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define TBL rax
+%define IDX rbx
+%define mem2 rbp
+
+
+
+
+
+; Stack Layout
+;
+; 470 DD2
+; 460 CC2
+; 450 BB2
+; 440 AA2
+; 430 DD
+; 420 CC
+; 410 BB
+; 400 AA
+;
+; 3F0 data2[15] for lanes 7...4 \
+; ... \
+; 300 data2[0] for lanes 7...4 \
+; 2F0 data2[15] for lanes 3...0 > mem block 2
+; ... /
+; 210 data2[1] for lanes 3...0 /
+; 200 data2[0] for lanes 3...0 /
+;
+; 1F0 data1[15] for lanes 7...4 \
+; ... \
+; 100 data1[0] for lanes 7...4 \
+; F0 data1[15] for lanes 3...0 > mem block 1
+; ... /
+; 10 data1[1] for lanes 3...0 /
+; 0 data1[0] for lanes 3...0 /
+
+MEM equ 16*16*2*2 ; two blocks of data stored in stack
+; STACK_SIZE must be an odd multiple of 8 bytes in size
+STACK_SIZE equ MEM + 16*8 + 8
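+; (an odd multiple of 8 leaves rsp 16-byte aligned after the sub, since the
+; call pushed 8 bytes, so the aligned vmovdqa accesses to the stack are valid)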
+
+%define AA rsp + MEM + 16*0
+%define BB rsp + MEM + 16*1
+%define CC rsp + MEM + 16*2
+%define DD rsp + MEM + 16*3
+%define AA2 rsp + MEM + 16*4
+%define BB2 rsp + MEM + 16*5
+%define CC2 rsp + MEM + 16*6
+%define DD2 rsp + MEM + 16*7
+
+;;%define DIGEST_SIZE (8*4*4) ; 8 streams x 4 32bit words per digest x 4 bytes per word
+
+;#define NUM_MD5_DIGEST_WORDS 4
+;#define NUM_LANES 8
+;#define MD5_BLOCK_SIZE 64
+;
+;typedef UINT32 digest_array[NUM_MD5_DIGEST_WORDS][NUM_LANES];
+;
+;typedef struct {
+; DECLARE_ALIGNED(digest_array digest, 16);
+; UINT8* data_ptr[NUM_LANES];
+;} MD5_ARGS_X8;
+
+; void md5_mb_x4x2_avx(MD5_ARGS_X8 *args, UINT64 size)
+; arg 1 : pointer to MD5_ARGS_X8 structure
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+;
+; arg1 and r8 are maintained by this function
+;
+align 32
+mk_global md5_mb_x4x2_avx, function, internal
+md5_mb_x4x2_avx:
+ endbranch
+ sub rsp, STACK_SIZE
+
+ ;; Initialize digests
+ vmovdqu A,[arg1+0*16]
+ vmovdqu B,[arg1+2*16]
+ vmovdqu C,[arg1+4*16]
+ vmovdqu D,[arg1+6*16]
+
+ vmovdqu A2,[arg1+1*16]
+ vmovdqu B2,[arg1+3*16]
+ vmovdqu C2,[arg1+5*16]
+ vmovdqu D2,[arg1+7*16]
+
+ lea TBL, [MD5_TABLE]
+
+ ;; load input pointers
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+ mov inp4,[arg1 + _data_ptr + 4*8]
+ mov inp5,[arg1 + _data_ptr + 5*8]
+ mov inp6,[arg1 + _data_ptr + 6*8]
+ mov inp7,[arg1 + _data_ptr + 7*8]
+
+ xor IDX, IDX
+
+ ; Make ping-pong pointers to the two memory blocks
+ mov mem1, rsp
+ lea mem2, [rsp + 16*16*2]
+
+
+;; Load first block of data and save back to stack
+%assign I 0
+%rep 4
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem1+(I*4+0)*16],T0
+ vmovdqa [mem1+(I*4+1)*16],T1
+ vmovdqa [mem1+(I*4+2)*16],T2
+ vmovdqa [mem1+(I*4+3)*16],T3
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem1+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem1+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem1+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem1+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+%endrep
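+;; mem1 now holds the first block in transposed form: dword i of lanes 0-3 at
+;; mem1 + i*16 and of lanes 4-7 at mem1 + i*16 + 16*16, matching the stack
+;; layout shown above.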
+
+lloop:
+
+ ; save old digests
+ vmovdqa [AA], A
+ vmovdqa [BB], B
+ vmovdqa [CC], C
+ vmovdqa [DD], D
+ ; save old digests
+ vmovdqa [AA2], A2
+ vmovdqa [BB2], B2
+ vmovdqa [CC2], C2
+ vmovdqa [DD2], D2
+
+ add IDX, 4*16
+ sub arg2, 1
+ je lastblock
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 0*16, [TBL+ 0*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 1*16, [TBL+ 1*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 2*16, [TBL+ 2*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 3*16, [TBL+ 3*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 4*16, [TBL+ 4*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 5*16, [TBL+ 5*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 6*16, [TBL+ 6*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 7*16, [TBL+ 7*16], rot14
+
+%assign I 0
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 8*16, [TBL+ 8*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 9*16, [TBL+ 9*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+10*16, [TBL+10*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+11*16, [TBL+11*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+12*16, [TBL+12*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+13*16, [TBL+13*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+14*16, [TBL+14*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+15*16, [TBL+15*16], rot14
+
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 1*16, [TBL+16*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 6*16, [TBL+17*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+11*16, [TBL+18*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 0*16, [TBL+19*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 5*16, [TBL+20*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+10*16, [TBL+21*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+15*16, [TBL+22*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 4*16, [TBL+23*16], rot24
+
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 9*16, [TBL+24*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+14*16, [TBL+25*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 3*16, [TBL+26*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 8*16, [TBL+27*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+13*16, [TBL+28*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 2*16, [TBL+29*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 7*16, [TBL+30*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+12*16, [TBL+31*16], rot24
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 5*16, [TBL+32*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 8*16, [TBL+33*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+11*16, [TBL+34*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+14*16, [TBL+35*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 1*16, [TBL+36*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 4*16, [TBL+37*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 7*16, [TBL+38*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+10*16, [TBL+39*16], rot34
+
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+13*16, [TBL+40*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 0*16, [TBL+41*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 3*16, [TBL+42*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 6*16, [TBL+43*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 9*16, [TBL+44*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+12*16, [TBL+45*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+15*16, [TBL+46*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 2*16, [TBL+47*16], rot34
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 0*16, [TBL+48*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 7*16, [TBL+49*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+14*16, [TBL+50*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 5*16, [TBL+51*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+12*16, [TBL+52*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 3*16, [TBL+53*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+10*16, [TBL+54*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 1*16, [TBL+55*16], rot44
+
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 8*16, [TBL+56*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+15*16, [TBL+57*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 6*16, [TBL+58*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+13*16, [TBL+59*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 4*16, [TBL+60*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+11*16, [TBL+61*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 2*16, [TBL+62*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 9*16, [TBL+63*16], rot44
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+
+ vpaddd A,A,[AA]
+ vpaddd B,B,[BB]
+ vpaddd C,C,[CC]
+ vpaddd D,D,[DD]
+
+ vpaddd A2,A2,[AA2]
+ vpaddd B2,B2,[BB2]
+ vpaddd C2,C2,[CC2]
+ vpaddd D2,D2,[DD2]
+
+ ; swap mem1 and mem2
+ xchg mem1, mem2
+
+ jmp lloop
+
+lastblock:
+
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+ 0*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+ 1*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+ 2*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+ 3*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+ 4*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+ 5*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+ 6*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+ 7*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+ 8*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+ 9*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+10*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+11*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+12*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+13*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+14*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+15*16], rot14
+
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+16*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+17*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+18*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+19*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+20*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+21*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+22*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+23*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+24*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+25*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+26*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+27*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+28*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+29*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+30*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+31*16], rot24
+
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+32*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+33*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+34*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+35*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+36*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+37*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+38*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+39*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+40*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+41*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+42*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+43*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+44*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+45*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+46*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+47*16], rot34
+
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+48*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+49*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+50*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+51*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+52*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+53*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+54*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+55*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+56*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+57*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+58*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+59*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+60*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+61*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+62*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+63*16], rot44
+
+ vpaddd A,A,[AA]
+ vpaddd B,B,[BB]
+ vpaddd C,C,[CC]
+ vpaddd D,D,[DD]
+
+ vpaddd A2,A2,[AA2]
+ vpaddd B2,B2,[BB2]
+ vpaddd C2,C2,[CC2]
+ vpaddd D2,D2,[DD2]
+
+ ; write out digests
+ vmovdqu [arg1+0*16], A
+ vmovdqu [arg1+2*16], B
+ vmovdqu [arg1+4*16], C
+ vmovdqu [arg1+6*16], D
+
+ vmovdqu [arg1+1*16], A2
+ vmovdqu [arg1+3*16], B2
+ vmovdqu [arg1+5*16], C2
+ vmovdqu [arg1+7*16], D2
+
+ ;; update input pointers
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ mov [arg1 + _data_ptr + 1*8], inp1
+ mov [arg1 + _data_ptr + 2*8], inp2
+ mov [arg1 + _data_ptr + 3*8], inp3
+ mov [arg1 + _data_ptr + 4*8], inp4
+ mov [arg1 + _data_ptr + 5*8], inp5
+ mov [arg1 + _data_ptr + 6*8], inp6
+ mov [arg1 + _data_ptr + 7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ add rsp, STACK_SIZE
+
+ ret
+
+section .data align=64
+
+align 64
+MD5_TABLE:
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm
new file mode 100644
index 000000000..b3b946634
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm
@@ -0,0 +1,779 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+; clobbers all XMM registers
+; clobbers all GPRs except arg1 and r8
+
+;; code to compute octal MD5 using SSE
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ movdqa %%t0, %%r0
+ shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ movdqa %%t1, %%r2
+ shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ movdqa %%r1, %%t0
+ shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ movdqa %%r3, %%r0
+ shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
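+
+; Worked example (for illustration): if the four lanes hold
+;   r0 = {3 2 1 0}, r1 = {7 6 5 4}, r2 = {11 10 9 8}, r3 = {15 14 13 12}
+; then after TRANSPOSE
+;   t0 = {12 8 4 0}, r1 = {13 9 5 1}, r0 = {14 10 6 2}, r3 = {15 11 7 3}
+; i.e. each output register gathers the same dword index from all four lanes,
+; which is the layout the vectorized rounds below consume.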
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ movdqa %%F,%%Z
+ pxor %%F,%%Y
+ pand %%F,%%X
+ pxor %%F,%%Z
+%endmacro
+
+; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y))
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ MAGIC_F %%F,%%Z,%%X,%%Y
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ movdqa %%F,%%Z
+ pxor %%F,%%Y
+ pxor %%F,%%X
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ pcmpeqd %%F,%%F
+ pxor %%F,%%Z ; pnot %%Z
+ por %%F,%%X
+ pxor %%F,%%Y
+%endmacro
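+
+; For reference, the scalar forms from RFC 1321 (shown only as a sketch):
+;   F(x,y,z) = (x & y) | (~x & z)  ==  z ^ (x & (y ^ z))   (form used above)
+;   G(x,y,z) = (x & z) | (y & ~z)  ==  F(z,x,y)            (hence MAGIC_G calls MAGIC_F)
+;   H(x,y,z) = x ^ y ^ z
+;   I(x,y,z) = y ^ (x | ~z)
+; The rewritten F needs no NOT and only the single temporary %%F.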
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psrld %%tmp, (32-%%imm)
+ pslld %%reg, %%imm
+ por %%reg, %%tmp
+%endmacro
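+
+; PROLD emulates a 32-bit rotate-left (SSE has no packed rotate instruction):
+;   rol32(x, n) == (x << n) | (x >> (32 - n))
+; which is exactly the psrld/pslld/por sequence above.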
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, data, MD5const, nrot
+%macro MD5_STEP1 14
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%data %12
+%define %%MD5const %13
+%define %%nrot %14
+
+ paddd %%A, %%MD5const
+ paddd %%A2, %%MD5const
+ paddd %%A, [%%data]
+ paddd %%A2, [%%data + 16*16]
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ paddd %%A, %%FUN
+ %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2
+ paddd %%A2, %%FUN
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP
+ paddd %%A, %%B
+ paddd %%A2, %%B2
+%endmacro
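+
+; Scalar sketch of one step, per lane:
+;   A = B + rol32(A + MAGIC(B,C,D) + data[k] + K, nrot)
+; MD5_STEP1 applies this to lanes 0-3 (A..D, data at [%%data]) and to lanes
+; 4-7 (A2..D2, data at [%%data + 16*16]), reusing a single FUN/TMP pair.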
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+ paddd %%A, %%MD5const
+ paddd %%A2, %%MD5const
+ paddd %%A, [%%data]
+ paddd %%A2, [%%data + 16*16]
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2
+ paddd %%A, %%FUN
+ paddd %%A2, %%FUN2
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP2
+ paddd %%A, %%B
+ paddd %%A2, %%B2
+%endmacro
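+
+; MD5_STEP performs the same step as MD5_STEP1 but with an independent
+; FUN2/TMP2 pair for the second set of lanes, so the two halves do not contend
+; for the same temporaries. It is used in the final block, where no data loads
+; are interleaved and xmm10/xmm11 (otherwise T0/T1) are free to serve as
+; FUN2/TMP2.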
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
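+; (rotXY = rotation amount for round X, position Y in that round; these match
+;  the per-round shift constants 7,12,17,22 / 5,9,14,20 / 4,11,16,23 /
+;  6,10,15,21 of RFC 1321)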
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4 ; tmp
+%define F xmm5 ; tmp
+
+%define A2 xmm6
+%define B2 xmm7
+%define C2 xmm8
+%define D2 xmm9
+
+
+%define FUN E
+%define TMP F
+%define FUN2 xmm10
+%define TMP2 xmm11
+
+%define T0 xmm10
+%define T1 xmm11
+%define T2 xmm12
+%define T3 xmm13
+%define T4 xmm14
+%define T5 xmm15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;; Linux Registers
+%define arg1 rdi
+%define arg2 rsi
+%define inp7 rcx
+%define mem1 rdx
+%else
+;; Windows Registers
+%define arg1 rcx
+%define arg2 rdx
+%define inp7 rdi
+%define mem1 rsi
+%endif
+; r8 is not used
+
+; Common definitions
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+
+%define TBL rax
+%define IDX rbx
+%define mem2 rbp
+
+
+; Stack Layout
+;
+; 470 DD2
+; 460 CC2
+; 450 BB2
+; 440 AA2
+; 430 DD
+; 420 CC
+; 410 BB
+; 400 AA
+;
+; 3F0 data2[15] for lanes 7...4 \
+; ... \
+; 300 data2[0] for lanes 7...4 \
+; 2F0 data2[15] for lanes 3...0 > mem block 2
+; ... /
+; 210 data2[1] for lanes 3...0 /
+; 200 data2[0] for lanes 3...0 /
+;
+; 1F0 data1[15] for lanes 7...4 \
+; ... \
+; 100 data1[0] for lanes 7...4 \
+; F0 data1[15] for lanes 3...0 > mem block 1
+; ... /
+; 10 data1[1] for lanes 3...0 /
+; 0 data1[0] for lanes 3...0 /
+
+MEM equ 16*16*2*2 ; two blocks of data stored in stack
+; STACK_SIZE must be an odd multiple of 8 bytes in size
+STACK_SIZE equ MEM + 16*8 + 8
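+; (on entry rsp is 8 mod 16 because of the return address pushed by call, so
+;  subtracting an odd multiple of 8 leaves rsp 16-byte aligned for the movdqa
+;  accesses to the areas defined below)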
+
+%define AA rsp + MEM + 16*0
+%define BB rsp + MEM + 16*1
+%define CC rsp + MEM + 16*2
+%define DD rsp + MEM + 16*3
+%define AA2 rsp + MEM + 16*4
+%define BB2 rsp + MEM + 16*5
+%define CC2 rsp + MEM + 16*6
+%define DD2 rsp + MEM + 16*7
+
+;;%define DIGEST_SIZE (8*4*4) ; 8 streams x 4 32bit words per digest x 4 bytes per word
+
+;#define NUM_MD5_DIGEST_WORDS 4
+;#define NUM_LANES 8
+;#define MD5_BLOCK_SIZE 64
+;
+;typedef UINT32 digest_array[NUM_MD5_DIGEST_WORDS][NUM_LANES];
+;
+;typedef struct {
+; DECLARE_ALIGNED(digest_array digest, 16);
+; UINT8* data_ptr[NUM_LANES];
+;} MD5_ARGS_X8;
+
+; void md5_mb_x4x2_sse(MD5_ARGS_X8 *args, UINT64 size)
+; arg 1 : pointer to MD5_ARGS_X8 structure
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+;
+; arg1 and r8 are maintained by this function
+;
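+; Illustrative call from C (a sketch; MD5_ARGS_X8 as in the commented block
+; above, n_blocks is just a placeholder name):
+;	MD5_ARGS_X8 args;			// transposed digests + 8 data pointers
+;	md5_mb_x4x2_sse(&args, n_blocks);	// hash n_blocks 64-byte blocks per lane
+;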
+align 32
+mk_global md5_mb_x4x2_sse, function, internal
+md5_mb_x4x2_sse:
+ endbranch
+ sub rsp, STACK_SIZE
+
+	;; Initialize digests for lanes 0-3
+ movdqu A,[arg1+0*16]
+ movdqu B,[arg1+2*16]
+ movdqu C,[arg1+4*16]
+ movdqu D,[arg1+6*16]
+
+	;; Initialize digests for lanes 4-7
+ movdqu A2,[arg1+1*16]
+ movdqu B2,[arg1+3*16]
+ movdqu C2,[arg1+5*16]
+ movdqu D2,[arg1+7*16]
+
+ lea TBL, [MD5_TABLE]
+
+ ;; load input pointers
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+ mov inp4,[arg1 + _data_ptr + 4*8]
+ mov inp5,[arg1 + _data_ptr + 5*8]
+ mov inp6,[arg1 + _data_ptr + 6*8]
+ mov inp7,[arg1 + _data_ptr + 7*8]
+ xor IDX, IDX
+
+ ; Make ping-pong pointers to the two memory blocks
+ mov mem1, rsp
+ lea mem2, [rsp + 16*16*2]
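+	; mem1 holds the block currently being hashed; mem2 receives the next
+	; block as it is loaded and transposed. The xchg at the bottom of lloop
+	; swaps the two roles every iteration.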
+
+
+;; Load first block of data and save back to stack
+%assign I 0
+%rep 4
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem1+(I*4+0)*16],T0
+ movdqa [mem1+(I*4+1)*16],T1
+ movdqa [mem1+(I*4+2)*16],T2
+ movdqa [mem1+(I*4+3)*16],T3
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem1+(I*4+0)*16 + 16*16],T0
+ movdqa [mem1+(I*4+1)*16 + 16*16],T1
+ movdqa [mem1+(I*4+2)*16 + 16*16],T2
+ movdqa [mem1+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+%endrep
+
+lloop:
+ ; save old digests
+ movdqa [AA], A
+ movdqa [BB], B
+ movdqa [CC], C
+ movdqa [DD], D
+ ; save old digests
+ movdqa [AA2], A2
+ movdqa [BB2], B2
+ movdqa [CC2], C2
+ movdqa [DD2], D2
+
+ add IDX, 4*16
+ sub arg2, 1
+ je lastblock
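+
+	; Main-loop pattern: the 64 rounds below consume the transposed block in
+	; mem1 while, between groups of eight rounds, the next block is fetched
+	; from the input streams and transposed into mem2, which helps hide the
+	; load latency behind the arithmetic.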
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14
+
+%assign I 0
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14
+
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24
+
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34
+
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44
+
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+
+ paddd A,[AA]
+ paddd B,[BB]
+ paddd C,[CC]
+ paddd D,[DD]
+
+ paddd A2,[AA2]
+ paddd B2,[BB2]
+ paddd C2,[CC2]
+ paddd D2,[DD2]
+
+ ; swap mem1 and mem2
+ xchg mem1, mem2
+
+ jmp lloop
+
+lastblock:
+
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14
+
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24
+
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34
+
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44
+
+ paddd A,[AA]
+ paddd B,[BB]
+ paddd C,[CC]
+ paddd D,[DD]
+
+ paddd A2,[AA2]
+ paddd B2,[BB2]
+ paddd C2,[CC2]
+ paddd D2,[DD2]
+
+ ; write out digests
+ movdqu [arg1+0*16], A
+ movdqu [arg1+2*16], B
+ movdqu [arg1+4*16], C
+ movdqu [arg1+6*16], D
+ movdqu [arg1+1*16], A2
+ movdqu [arg1+3*16], B2
+ movdqu [arg1+5*16], C2
+ movdqu [arg1+7*16], D2
+
+ ;; update input pointers
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ mov [arg1 + _data_ptr + 1*8], inp1
+ mov [arg1 + _data_ptr + 2*8], inp2
+ mov [arg1 + _data_ptr + 3*8], inp3
+ mov [arg1 + _data_ptr + 4*8], inp4
+ mov [arg1 + _data_ptr + 5*8], inp5
+ mov [arg1 + _data_ptr + 6*8], inp6
+ mov [arg1 + _data_ptr + 7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ add rsp, STACK_SIZE
+
+ ret
+
+section .data align=64
+
+align 64
+MD5_TABLE:
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm
new file mode 100644
index 000000000..b5d6a4875
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm
@@ -0,0 +1,920 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute double octal (16-lane) MD5 using AVX2
+
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp
+;;
+;; clobbers ymm0-15
+
+;; clobbers all GPRs other than arg1 and rbp
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx
+ %define arg2 rdx
+ %define reg3 rdi
+ %define reg4 rsi
+%else
+ %define arg1 rdi
+ %define arg2 rsi
+ %define reg3 rcx
+ %define reg4 rdx
+%endif
+
+;; rbp is not clobbered
+
+%define state arg1
+%define num_blks arg2
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+%define inp4 r12
+%define inp5 r13
+%define inp6 r14
+%define inp7 r15
+
+;; These are pointers to data block1 and block2 in the stack
+; which will ping pong back and forth
+%define DPTR1 rbx
+%define DPTR2 reg3
+
+%define TBL rax
+%define IDX reg4
+
+;; Transposed Digest Storage
+%define Y_A ymm0
+%define Y_B ymm1
+%define Y_C ymm2
+%define Y_D ymm3
+%define Y_A2 ymm4
+%define Y_B2 ymm5
+%define Y_C2 ymm6
+%define Y_D2 ymm7
+
+;; Temp YMM registers corresponding to the Temp XMM registers
+;; used during the transposition of the digests
+%define Y_KTMP1 ymm12
+%define Y_KTMP2 ymm13
+;; Temporary registers used during MD5 round operations
+%define Y_FUN ymm8
+%define Y_TMP ymm9
+%define Y_FUN2 ymm10
+%define Y_TMP2 ymm11
+
+
+;; YMM registers used during data fetching.
+;; Data are stored into the stack after transposition
+%define Y_DAT0 ymm8
+%define Y_DAT1 ymm9
+%define Y_DAT2 ymm10
+%define Y_DAT3 ymm11
+%define Y_DAT4 ymm12
+%define Y_DAT5 ymm13
+%define Y_DAT6 ymm14
+%define Y_DAT7 ymm15
+
+;; Temporary registers used during data transposition
+%define Y_DTMP1 ymm0
+%define Y_DTMP2 ymm1
+
+
+%define RESY resb 32*
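+; (defined so that "RESY n" expands to "resb 32*n", i.e. n YMM-sized slots)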
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESIZE mod 32 must be 32-8 = 24
+struc STACK
+_DATA: RESY 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
+_DIGEST: RESY 8 ; stores Y_AA-Y_DD, Y_AA2-Y_DD2
+_TMPDIGEST: RESY 2 ; stores Y_AA, Y_BB temporarily
+_RSP_SAVE: RESQ 1 ; original RSP
+endstruc
+
+
+%define Y_AA rsp + _DIGEST + 32*0
+%define Y_BB rsp + _DIGEST + 32*1
+%define Y_CC rsp + _DIGEST + 32*2
+%define Y_DD rsp + _DIGEST + 32*3
+%define Y_AA2 rsp + _DIGEST + 32*4
+%define Y_BB2 rsp + _DIGEST + 32*5
+%define Y_CC2 rsp + _DIGEST + 32*6
+%define Y_DD2 rsp + _DIGEST + 32*7
+
+%define MD5_DIGEST_ROW_SIZE (16*4)
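+; (one digest row holds one 32-bit word for each of the 16 lanes: 16 * 4 bytes)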
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
+
+; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+; "transpose" data in {r0...r7} using temps {t0...t1}
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+
+;
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+
+ ; use r2 in place of t0
+ ; process bottom half (r4..r7) {e...h}
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+
+ vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
+ vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
+ vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
+ vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
+ vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
+ vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
+ vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
+ vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
+%endmacro
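+
+; The transpose works in two stages: 4x4 dword shuffles within each 128-bit
+; lane (vshufps), then vperm2f128 to exchange the 128-bit halves, so that each
+; output YMM register ends up holding the same dword index from all 8 streams.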
+
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ vpxor %%F,%%Z, %%Y
+ vpand %%F,%%F,%%X
+ vpxor %%F,%%F,%%Z
+%endmacro
+
+; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y))
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ MAGIC_F %%F,%%Z,%%X,%%Y
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ vpxor %%F,%%Z, %%Y
+ vpxor %%F,%%F, %%X
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+	vpcmpeqd %%F,%%F,%%F	; F = all ones
+ vpxor %%F,%%F,%%Z ; pnot %%Z
+ vpor %%F,%%F,%%X
+ vpxor %%F,%%F,%%Y
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%rA %2
+%define %%rB %3
+%define %%rC %4
+%define %%rD %5
+%define %%rA2 %6
+%define %%rB2 %7
+%define %%rC2 %8
+%define %%rD2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+ vpaddd %%rA, %%rA, %%MD5const
+ vpaddd %%rA2, %%rA2, %%MD5const
+ vpaddd %%rA, %%rA, [%%data]
+ vpaddd %%rA2, %%rA2, [%%data + 16*32]
+ %%MAGIC_FUN %%FUN, %%rB,%%rC,%%rD
+ %%MAGIC_FUN %%FUN2, %%rB2,%%rC2,%%rD2
+ vpaddd %%rA, %%rA, %%FUN
+ vpaddd %%rA2, %%rA2, %%FUN2
+ PROLD %%rA,%%nrot, %%TMP
+ PROLD %%rA2,%%nrot, %%TMP2
+ vpaddd %%rA, %%rA, %%rB
+ vpaddd %%rA2, %%rA2, %%rB2
+%endmacro
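+
+; ([%%data] is the current message word for lanes 0-7; [%%data + 16*32] is the
+;  same word for lanes 8-15, since the stack keeps 16 YMM rows per 8-lane set)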
+
+align 32
+
+; void md5_mb_x8x2_avx2(MD5_ARGS *args, UINT64 num_blks)
+; arg 1 : pointer to MD5_ARGS structure
+; arg 2 : number of blocks (>=1)
+
+mk_global md5_mb_x8x2_avx2, function, internal
+md5_mb_x8x2_avx2:
+ endbranch
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32
+ mov [rsp + _RSP_SAVE], rax
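+	; rsp is aligned down to 32 bytes for the vmovdqa stores below; the
+	; original rsp (saved in rax) is kept at _RSP_SAVE so the Postamble can
+	; restore it with a single mov.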
+
+ mov DPTR1, rsp
+ lea DPTR2, [rsp + 32*32]
+
+ ;; Load MD5 constant pointer to register
+ lea TBL, [MD5_TABLE]
+
+ ; Initialize index for data retrieval
+ xor IDX, IDX
+
+ ;; Fetch Pointers to Data Stream 1 to 8
+ mov inp0,[state + _data_ptr + 0*8]
+ mov inp1,[state + _data_ptr + 1*8]
+ mov inp2,[state + _data_ptr + 2*8]
+ mov inp3,[state + _data_ptr + 3*8]
+ mov inp4,[state + _data_ptr + 4*8]
+ mov inp5,[state + _data_ptr + 5*8]
+ mov inp6,[state + _data_ptr + 6*8]
+ mov inp7,[state + _data_ptr + 7*8]
+
+%assign I 0
+%rep 2
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR1+_DATA+(I*8+0)*32],Y_DAT0
+ vmovdqa [DPTR1+_DATA+(I*8+1)*32],Y_DAT1
+ vmovdqa [DPTR1+_DATA+(I*8+2)*32],Y_DAT2
+ vmovdqa [DPTR1+_DATA+(I*8+3)*32],Y_DAT3
+ vmovdqa [DPTR1+_DATA+(I*8+4)*32],Y_DAT4
+ vmovdqa [DPTR1+_DATA+(I*8+5)*32],Y_DAT5
+ vmovdqa [DPTR1+_DATA+(I*8+6)*32],Y_DAT6
+ vmovdqa [DPTR1+_DATA+(I*8+7)*32],Y_DAT7
+
+%assign I (I+1)
+%endrep
+
+ ;; Fetch Pointers to Data Stream 9 to 16
+ mov inp0,[state + _data_ptr + 8*8]
+ mov inp1,[state + _data_ptr + 9*8]
+ mov inp2,[state + _data_ptr + 10*8]
+ mov inp3,[state + _data_ptr + 11*8]
+ mov inp4,[state + _data_ptr + 12*8]
+ mov inp5,[state + _data_ptr + 13*8]
+ mov inp6,[state + _data_ptr + 14*8]
+ mov inp7,[state + _data_ptr + 15*8]
+
+%assign I 0
+%rep 2
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR1+_DATA+((I+2)*8+0)*32],Y_DAT0
+ vmovdqa [DPTR1+_DATA+((I+2)*8+1)*32],Y_DAT1
+ vmovdqa [DPTR1+_DATA+((I+2)*8+2)*32],Y_DAT2
+ vmovdqa [DPTR1+_DATA+((I+2)*8+3)*32],Y_DAT3
+ vmovdqa [DPTR1+_DATA+((I+2)*8+4)*32],Y_DAT4
+ vmovdqa [DPTR1+_DATA+((I+2)*8+5)*32],Y_DAT5
+ vmovdqa [DPTR1+_DATA+((I+2)*8+6)*32],Y_DAT6
+ vmovdqa [DPTR1+_DATA+((I+2)*8+7)*32],Y_DAT7
+
+%assign I (I+1)
+%endrep
+ ;; digests are already transposed
+ vmovdqu Y_A,[state + 0 * MD5_DIGEST_ROW_SIZE ]
+ vmovdqu Y_B,[state + 1 * MD5_DIGEST_ROW_SIZE ]
+ vmovdqu Y_C,[state + 2 * MD5_DIGEST_ROW_SIZE ]
+ vmovdqu Y_D,[state + 3 * MD5_DIGEST_ROW_SIZE ]
+
+ ; Load the digest for each stream (9-16)
+ vmovdqu Y_A2,[state + 0 * MD5_DIGEST_ROW_SIZE + 32]
+ vmovdqu Y_B2,[state + 1 * MD5_DIGEST_ROW_SIZE + 32]
+ vmovdqu Y_C2,[state + 2 * MD5_DIGEST_ROW_SIZE + 32]
+ vmovdqu Y_D2,[state + 3 * MD5_DIGEST_ROW_SIZE + 32]
+
+lloop:
+
+ ; save old digests to stack
+ vmovdqa [Y_AA], Y_A
+ vmovdqa [Y_BB], Y_B
+ vmovdqa [Y_CC], Y_C
+ vmovdqa [Y_DD], Y_D
+
+ vmovdqa [Y_AA2], Y_A2
+ vmovdqa [Y_BB2], Y_B2
+ vmovdqa [Y_CC2], Y_C2
+ vmovdqa [Y_DD2], Y_D2
+
+ ;; Increment IDX to point to next data block (64 bytes per block)
+ add IDX, 64
+
+ ;; Update size of remaining blocks to process
+ sub num_blks, 1
+ je lastblock
+
+ ; Perform the 64 rounds of processing ...
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14
+
+
+	;; Fetch Pointers to Data Stream 1 to 8 again (inp0..inp7 were reused for streams 9 to 16)
+ mov inp0,[state + _data_ptr + 0*8]
+ mov inp1,[state + _data_ptr + 1*8]
+ mov inp2,[state + _data_ptr + 2*8]
+ mov inp3,[state + _data_ptr + 3*8]
+ mov inp4,[state + _data_ptr + 4*8]
+ mov inp5,[state + _data_ptr + 5*8]
+ mov inp6,[state + _data_ptr + 6*8]
+ mov inp7,[state + _data_ptr + 7*8]
+
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14
+
+%assign I 0
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24
+
+%assign I (I+1)
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34
+
+ ;; Fetch Pointers to Data Stream 9 to 16
+ mov inp0,[state + _data_ptr + 8*8]
+ mov inp1,[state + _data_ptr + 9*8]
+ mov inp2,[state + _data_ptr + 10*8]
+ mov inp3,[state + _data_ptr + 11*8]
+ mov inp4,[state + _data_ptr + 12*8]
+ mov inp5,[state + _data_ptr + 13*8]
+ mov inp6,[state + _data_ptr + 14*8]
+ mov inp7,[state + _data_ptr + 15*8]
+
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34
+
+%assign I 0
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44
+
+%assign I (I+1)
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+ ; Add results to old digest values
+
+ vpaddd Y_A,Y_A,[Y_AA]
+ vpaddd Y_B,Y_B,[Y_BB]
+ vpaddd Y_C,Y_C,[Y_CC]
+ vpaddd Y_D,Y_D,[Y_DD]
+
+ vpaddd Y_A2,Y_A2,[Y_AA2]
+ vpaddd Y_B2,Y_B2,[Y_BB2]
+ vpaddd Y_C2,Y_C2,[Y_CC2]
+ vpaddd Y_D2,Y_D2,[Y_DD2]
+
+ ; Swap DPTR1 and DPTR2
+ xchg DPTR1, DPTR2
+
+ ;; Proceed to processing of next block
+ jmp lloop
+
+lastblock:
+
+ ; Perform the 64 rounds of processing ...
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14
+
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24
+
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34
+
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44
+
+	;; update the input data pointers
+%assign I 0
+%rep 8
+ mov inp0, [state + _data_ptr + (2*I)*8]
+ mov inp1, [state + _data_ptr + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [state + _data_ptr + (2*I)*8], inp0
+ mov [state + _data_ptr + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ vpaddd Y_A,Y_A,[Y_AA]
+ vpaddd Y_B,Y_B,[Y_BB]
+ vpaddd Y_C,Y_C,[Y_CC]
+ vpaddd Y_D,Y_D,[Y_DD]
+
+ vpaddd Y_A2,Y_A2,[Y_AA2]
+ vpaddd Y_B2,Y_B2,[Y_BB2]
+ vpaddd Y_C2,Y_C2,[Y_CC2]
+ vpaddd Y_D2,Y_D2,[Y_DD2]
+
+
+
+ vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE ],Y_A
+ vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE ],Y_B
+ vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE ],Y_C
+ vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE ],Y_D
+
+
+ vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE + 32 ],Y_A2 ;; 32 is YMM width
+ vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE + 32 ],Y_B2
+ vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE + 32 ],Y_C2
+ vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE + 32 ],Y_D2
+
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+
+
+ mov rsp, [rsp + _RSP_SAVE]
+
+ ret
+
+section .data
+align 64
+MD5_TABLE:
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
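
Each round constant in MD5_TABLE above is replicated across eight dwords, so a single 32-byte load broadcasts it to every lane of a YMM register; the kernel keeps two such register sets (Y_A..Y_D and Y_A2..Y_D2) to drive 16 buffers per pass. The following is only an illustrative scalar model of what one vectorized round-1 step does per lane; the identifiers are made up and not taken from the assembly.

    /* Scalar model of an 8-lane MD5 round-1 step: the same F-function,
     * constant and rotation applied to NLANES independent streams.
     * 0xd76aa478 is the first constant in MD5_TABLE; 7 is MD5's
     * first-round rotation (rot11 in the macros above). */
    #include <stdint.h>

    #define NLANES 8                               /* dwords per YMM register */
    #define ROL32(x, r) (((x) << (r)) | ((x) >> (32 - (r))))

    void md5_round1_step_lanes(uint32_t a[NLANES], const uint32_t b[NLANES],
                               const uint32_t c[NLANES], const uint32_t d[NLANES],
                               const uint32_t w[NLANES])
    {
            for (int lane = 0; lane < NLANES; lane++) {
                    uint32_t f = d[lane] ^ (b[lane] & (c[lane] ^ d[lane])); /* MAGIC_F */
                    uint32_t t = a[lane] + f + 0xd76aa478 + w[lane];
                    a[lane] = b[lane] + ROL32(t, 7);
            }
    }
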
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm
new file mode 100644
index 000000000..6e31d297a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm
@@ -0,0 +1,80 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern md5_ctx_mgr_init_sse
+extern md5_ctx_mgr_submit_sse
+extern md5_ctx_mgr_flush_sse
+
+extern md5_ctx_mgr_init_avx
+extern md5_ctx_mgr_submit_avx
+extern md5_ctx_mgr_flush_avx
+
+extern md5_ctx_mgr_init_avx2
+extern md5_ctx_mgr_submit_avx2
+extern md5_ctx_mgr_flush_avx2
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern md5_ctx_mgr_init_avx512
+ extern md5_ctx_mgr_submit_avx512
+ extern md5_ctx_mgr_flush_avx512
+%endif
+
+extern md5_ctx_mgr_init_base
+extern md5_ctx_mgr_submit_base
+extern md5_ctx_mgr_flush_base
+
+;;; *_mbinit are the initial values for *_dispatched, which is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+; Initialise symbols
+mbin_interface md5_ctx_mgr_init
+mbin_interface md5_ctx_mgr_submit
+mbin_interface md5_ctx_mgr_flush
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 md5_ctx_mgr_init, md5_ctx_mgr_init_base, md5_ctx_mgr_init_sse, md5_ctx_mgr_init_avx, md5_ctx_mgr_init_avx2, md5_ctx_mgr_init_avx512
+ mbin_dispatch_init6 md5_ctx_mgr_submit, md5_ctx_mgr_submit_base, md5_ctx_mgr_submit_sse, md5_ctx_mgr_submit_avx, md5_ctx_mgr_submit_avx2, md5_ctx_mgr_submit_avx512
+ mbin_dispatch_init6 md5_ctx_mgr_flush, md5_ctx_mgr_flush_base, md5_ctx_mgr_flush_sse, md5_ctx_mgr_flush_avx, md5_ctx_mgr_flush_avx2, md5_ctx_mgr_flush_avx512
+%else
+ mbin_dispatch_init md5_ctx_mgr_init, md5_ctx_mgr_init_sse, md5_ctx_mgr_init_avx, md5_ctx_mgr_init_avx2
+ mbin_dispatch_init md5_ctx_mgr_submit, md5_ctx_mgr_submit_sse, md5_ctx_mgr_submit_avx, md5_ctx_mgr_submit_avx2
+ mbin_dispatch_init md5_ctx_mgr_flush, md5_ctx_mgr_flush_sse, md5_ctx_mgr_flush_avx, md5_ctx_mgr_flush_avx2
+%endif
+
+;; func core, ver, snum
+slversion md5_ctx_mgr_init, 00, 04, 0189
+slversion md5_ctx_mgr_submit, 00, 04, 018a
+slversion md5_ctx_mgr_flush, 00, 04, 018b
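
The mbin_interface / mbin_dispatch_init macros above implement lazy CPU-feature dispatch: each public md5_ctx_mgr_* symbol initially routes through a resolver, and the first call records the best implementation the CPU (and assembler) supports, so subsequent calls jump straight to it. Below is only a rough C analogy of that pattern, not the actual assembly, and all names are made up.

    /* C sketch of the multibinary lazy-dispatch idea; the real mechanism
     * lives in multibinary.asm. */
    #include <stdio.h>

    typedef void (*init_fn)(void);

    static void init_base(void) { puts("base init"); }
    static void init_avx2(void) { puts("avx2 init"); }

    static int cpu_has_avx2(void) { return 0; }   /* stand-in for the real CPUID probe */

    static void init_resolver(void);

    /* the "dispatched" slot starts at the resolver and is rewritten on first use */
    static init_fn init_dispatched = init_resolver;

    static void init_resolver(void)
    {
            init_dispatched = cpu_has_avx2() ? init_avx2 : init_base;
            init_dispatched();                    /* forward the very first call */
    }

    /* public entry point: after the first call this is a single indirect jump */
    void md5_ctx_mgr_init_sketch(void)
    {
            init_dispatched();
    }

    int main(void)
    {
            md5_ctx_mgr_init_sketch();            /* resolves, then runs the pick */
            md5_ctx_mgr_init_sketch();            /* goes straight to the pick */
            return 0;
    }
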
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c
new file mode 100644
index 000000000..ed4721107
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c
@@ -0,0 +1,186 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "endian_helper.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference MD5 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+static void OPT_FIX md5_single(const uint8_t * data, uint32_t digest[4]);
+
+#define H0 0x67452301
+#define H1 0xefcdab89
+#define H2 0x98badcfe
+#define H3 0x10325476
+
+void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[128];
+
+ digest[0] = H0;
+ digest[1] = H1;
+ digest[2] = H2;
+ digest[3] = H3;
+
+ i = len;
+ while (i >= 64) {
+ md5_single(input_data, digest);
+ input_data += 64;
+ i -= 64;
+ }
+ // 0 <= i < 64
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < 120; j++)
+ buf[j] = 0;
+
+ if (i > 64 - 8)
+ i = 128;
+ else
+ i = 64;
+
+ *(uint64_t *) (buf + i - 8) = to_le64((uint64_t) len * 8);
+
+ md5_single(buf, digest);
+ if (i == 128)
+ md5_single(buf + 64, digest);
+}
+
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (c ^ (d & (b ^ c)))
+#define F3(b,c,d) (b ^ c ^ d)
+#define F4(b,c,d) (c ^ (b | ~d))
+
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+#define step(i,a,b,c,d,f,k,w,r) \
+ if (i < 16) {f = F1(b,c,d); } else \
+ if (i < 32) {f = F2(b,c,d); } else \
+ if (i < 48) {f = F3(b,c,d); } else \
+ {f = F4(b,c,d); } \
+ f = a + f + k + to_le32(w); \
+ a = b + rol32(f, r);
+
+void md5_single(const uint8_t * data, uint32_t digest[4])
+{
+ uint32_t a, b, c, d;
+ uint32_t f;
+ uint32_t *w = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+
+ step(0, a, b, c, d, f, 0xd76aa478, w[0], 7);
+ step(1, d, a, b, c, f, 0xe8c7b756, w[1], 12);
+ step(2, c, d, a, b, f, 0x242070db, w[2], 17);
+ step(3, b, c, d, a, f, 0xc1bdceee, w[3], 22);
+ step(4, a, b, c, d, f, 0xf57c0faf, w[4], 7);
+ step(5, d, a, b, c, f, 0x4787c62a, w[5], 12);
+ step(6, c, d, a, b, f, 0xa8304613, w[6], 17);
+ step(7, b, c, d, a, f, 0xfd469501, w[7], 22);
+ step(8, a, b, c, d, f, 0x698098d8, w[8], 7);
+ step(9, d, a, b, c, f, 0x8b44f7af, w[9], 12);
+ step(10, c, d, a, b, f, 0xffff5bb1, w[10], 17);
+ step(11, b, c, d, a, f, 0x895cd7be, w[11], 22);
+ step(12, a, b, c, d, f, 0x6b901122, w[12], 7);
+ step(13, d, a, b, c, f, 0xfd987193, w[13], 12);
+ step(14, c, d, a, b, f, 0xa679438e, w[14], 17);
+ step(15, b, c, d, a, f, 0x49b40821, w[15], 22);
+
+ step(16, a, b, c, d, f, 0xf61e2562, w[1], 5);
+ step(17, d, a, b, c, f, 0xc040b340, w[6], 9);
+ step(18, c, d, a, b, f, 0x265e5a51, w[11], 14);
+ step(19, b, c, d, a, f, 0xe9b6c7aa, w[0], 20);
+ step(20, a, b, c, d, f, 0xd62f105d, w[5], 5);
+ step(21, d, a, b, c, f, 0x02441453, w[10], 9);
+ step(22, c, d, a, b, f, 0xd8a1e681, w[15], 14);
+ step(23, b, c, d, a, f, 0xe7d3fbc8, w[4], 20);
+ step(24, a, b, c, d, f, 0x21e1cde6, w[9], 5);
+ step(25, d, a, b, c, f, 0xc33707d6, w[14], 9);
+ step(26, c, d, a, b, f, 0xf4d50d87, w[3], 14);
+ step(27, b, c, d, a, f, 0x455a14ed, w[8], 20);
+ step(28, a, b, c, d, f, 0xa9e3e905, w[13], 5);
+ step(29, d, a, b, c, f, 0xfcefa3f8, w[2], 9);
+ step(30, c, d, a, b, f, 0x676f02d9, w[7], 14);
+ step(31, b, c, d, a, f, 0x8d2a4c8a, w[12], 20);
+
+ step(32, a, b, c, d, f, 0xfffa3942, w[5], 4);
+ step(33, d, a, b, c, f, 0x8771f681, w[8], 11);
+ step(34, c, d, a, b, f, 0x6d9d6122, w[11], 16);
+ step(35, b, c, d, a, f, 0xfde5380c, w[14], 23);
+ step(36, a, b, c, d, f, 0xa4beea44, w[1], 4);
+ step(37, d, a, b, c, f, 0x4bdecfa9, w[4], 11);
+ step(38, c, d, a, b, f, 0xf6bb4b60, w[7], 16);
+ step(39, b, c, d, a, f, 0xbebfbc70, w[10], 23);
+ step(40, a, b, c, d, f, 0x289b7ec6, w[13], 4);
+ step(41, d, a, b, c, f, 0xeaa127fa, w[0], 11);
+ step(42, c, d, a, b, f, 0xd4ef3085, w[3], 16);
+ step(43, b, c, d, a, f, 0x04881d05, w[6], 23);
+ step(44, a, b, c, d, f, 0xd9d4d039, w[9], 4);
+ step(45, d, a, b, c, f, 0xe6db99e5, w[12], 11);
+ step(46, c, d, a, b, f, 0x1fa27cf8, w[15], 16);
+ step(47, b, c, d, a, f, 0xc4ac5665, w[2], 23);
+
+ step(48, a, b, c, d, f, 0xf4292244, w[0], 6);
+ step(49, d, a, b, c, f, 0x432aff97, w[7], 10);
+ step(50, c, d, a, b, f, 0xab9423a7, w[14], 15);
+ step(51, b, c, d, a, f, 0xfc93a039, w[5], 21);
+ step(52, a, b, c, d, f, 0x655b59c3, w[12], 6);
+ step(53, d, a, b, c, f, 0x8f0ccc92, w[3], 10);
+ step(54, c, d, a, b, f, 0xffeff47d, w[10], 15);
+ step(55, b, c, d, a, f, 0x85845dd1, w[1], 21);
+ step(56, a, b, c, d, f, 0x6fa87e4f, w[8], 6);
+ step(57, d, a, b, c, f, 0xfe2ce6e0, w[15], 10);
+ step(58, c, d, a, b, f, 0xa3014314, w[6], 15);
+ step(59, b, c, d, a, f, 0x4e0811a1, w[13], 21);
+ step(60, a, b, c, d, f, 0xf7537e82, w[4], 6);
+ step(61, d, a, b, c, f, 0xbd3af235, w[11], 10);
+ step(62, c, d, a, b, f, 0x2ad7d2bb, w[2], 15);
+ step(63, b, c, d, a, f, 0xeb86d391, w[9], 21);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+}
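
md5_ref above computes a plain single-buffer MD5 and serves as the known-answer reference for the multi-buffer code. A minimal caller might look like the following sketch, assuming it is compiled and linked together with md5_ref.c; the prototype is copied from the definition above.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);

    int main(void)
    {
            uint8_t msg[] = "abc";
            uint32_t digest[4];

            md5_ref(msg, digest, (uint32_t) strlen((char *) msg));

            /* emit the conventional hex string: A, B, C, D, least-significant byte first */
            for (int i = 0; i < 4; i++)
                    for (int b = 0; b < 4; b++)
                            printf("%02x", (unsigned) ((digest[i] >> (8 * b)) & 0xff));
            printf("\n");   /* RFC 1321 gives 900150983cd24fb0d6963f7d28e17f72 for "abc" */
            return 0;
    }
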
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am b/src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am
new file mode 100644
index 000000000..696e9c57d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am
@@ -0,0 +1,83 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_mh_sha1_base = \
+ mh_sha1/mh_sha1_block_base.c \
+ mh_sha1/mh_sha1_finalize_base.c \
+ mh_sha1/mh_sha1_update_base.c \
+ mh_sha1/sha1_for_mh_sha1.c \
+ mh_sha1/mh_sha1.c
+
+lsrc_x86_64 += \
+ $(lsrc_mh_sha1_base) \
+ mh_sha1/mh_sha1_multibinary.asm \
+ mh_sha1/mh_sha1_block_sse.asm \
+ mh_sha1/mh_sha1_block_avx.asm \
+ mh_sha1/mh_sha1_block_avx2.asm \
+ mh_sha1/mh_sha1_block_avx512.asm \
+ mh_sha1/mh_sha1_avx512.c
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += \
+ $(lsrc_mh_sha1_base) \
+ mh_sha1/aarch64/mh_sha1_multibinary.S \
+ mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c \
+ mh_sha1/aarch64/mh_sha1_block_asimd.S \
+ mh_sha1/aarch64/mh_sha1_asimd.c \
+ mh_sha1/aarch64/mh_sha1_block_ce.S \
+ mh_sha1/aarch64/mh_sha1_ce.c
+
+lsrc_base_aliases += \
+ $(lsrc_mh_sha1_base) \
+ mh_sha1/mh_sha1_base_aliases.c
+
+other_src += mh_sha1/mh_sha1_ref.c \
+ include/reg_sizes.asm \
+ include/multibinary.asm \
+ include/test.h \
+ mh_sha1/mh_sha1_internal.h
+
+src_include += -I $(srcdir)/mh_sha1
+
+extern_hdrs += include/mh_sha1.h
+
+check_tests += mh_sha1/mh_sha1_test
+unit_tests += mh_sha1/mh_sha1_update_test
+
+perf_tests += mh_sha1/mh_sha1_perf
+
+
+mh_sha1_test: mh_sha1_ref.o
+mh_sha1_mh_sha1_test_LDADD = mh_sha1/mh_sha1_ref.lo libisal_crypto.la
+
+mh_sha1_update_test: mh_sha1_ref.o
+mh_sha1_mh_sha1_update_test_LDADD = mh_sha1/mh_sha1_ref.lo libisal_crypto.la
+
+mh_sha1_mh_sha1_perf_LDADD = libisal_crypto.la
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c
new file mode 100644
index 000000000..2ad8871fa
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_aarch64_dispatcher.c
@@ -0,0 +1,55 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(mh_sha1_update)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(mh_sha1_update_ce);
+
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(mh_sha1_update_asimd);
+
+ return PROVIDER_BASIC(mh_sha1_update);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(mh_sha1_finalize)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(mh_sha1_finalize_ce);
+
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(mh_sha1_finalize_asimd);
+
+ return PROVIDER_BASIC(mh_sha1_finalize);
+
+}
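
The dispatcher above picks an implementation at resolution time by probing the AArch64 hwcaps: the SHA-1 crypto extension is preferred, then plain ASIMD, then the portable base code. The probe is just getauxval(3); the standalone sketch below shows the same check in isolation (Linux/AArch64 only, not part of the library).

    #include <stdio.h>
    #include <sys/auxv.h>
    /* Depending on the libc, HWCAP_SHA1/HWCAP_ASIMD may also require <asm/hwcap.h>. */

    int main(void)
    {
            unsigned long hwcap = getauxval(AT_HWCAP);

            if (hwcap & HWCAP_SHA1)
                    puts("would pick the SHA-1 crypto-extension (ce) kernel");
            else if (hwcap & HWCAP_ASIMD)
                    puts("would pick the asimd kernel");
            else
                    puts("would fall back to the base implementation");
            return 0;
    }
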
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_asimd.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_asimd.c
new file mode 100644
index 000000000..c913a64df
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_asimd.c
@@ -0,0 +1,53 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+void mh_sha1_block_asimd(const uint8_t * input_data,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+/***************mh_sha1_update***********/
+// mh_sha1_update_asimd.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_asimd
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_asimd
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************mh_sha1_finalize AND mh_sha1_tail***********/
+// mh_sha1_tail processes the last, incomplete block of src data
+// mh_sha1_finalize is the mh_sha1_ctx-level wrapper around mh_sha1_tail
+// mh_sha1_finalize_asimd.c and mh_sha1_tail_asimd.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_asimd
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_asimd
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_asimd
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
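
mh_sha1_asimd.c instantiates the generic update and finalize logic for this block kernel by defining the function-name macros and then #including the *_base.c templates. Below is a reduced, two-file sketch of the same include-as-template technique; every file and symbol name in it is invented for illustration.

    /* File scale_by2.c -- analogous to mh_sha1_asimd.c above.  It assumes a
     * sibling template file scale_base.c containing only:
     *
     *     int SCALE_FUNCTION(int x)
     *     {
     *             return x * SCALE_FACTOR;
     *     }
     *
     * i.e. code parameterized purely by macros the includer must define. */
    #include <stdio.h>

    #define SCALE_FUNCTION scale_by2
    #define SCALE_FACTOR   2
    #include "scale_base.c"          /* expands into: int scale_by2(int x) { ... } */
    #undef SCALE_FUNCTION
    #undef SCALE_FACTOR

    int main(void)
    {
            printf("%d\n", scale_by2(21));   /* prints 42 */
            return 0;
    }
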
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_asimd.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_asimd.S
new file mode 100644
index 000000000..22f716f27
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_asimd.S
@@ -0,0 +1,124 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+#include "sha1_asimd_common.S"
+
+.macro load_x4_word idx:req
+ ld1 {WORD\idx\().16b},[segs_ptr]
+ add segs_ptr,segs_ptr,#64
+.endm
+
+/*
+ * void mh_sha1_block_asimd (const uint8_t * input_data,
+ * uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ * uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ * uint32_t num_blocks);
+ * arg 0 pointer to input data
+ * arg 1 pointer to digests, including the segment digests (uint32_t digests[16][5])
+ * arg 2 pointer to aligned_frame_buffer, which is used to save the big-endian data.
+ * arg 3 number of 1KB blocks
+ */
+
+ input_data .req x0
+ sha1_digest .req x1
+ data_buf .req x2
+ num_blocks .req w3
+ src .req x4
+ dst .req x5
+ offs .req x6
+ mh_segs .req x7
+ tmp .req x8
+ segs_ptr .req x9
+ block_ctr .req w10
+
+ .global mh_sha1_block_asimd
+ .type mh_sha1_block_asimd, %function
+mh_sha1_block_asimd:
+ cmp num_blocks, #0
+ beq .return
+ sha1_asimd_save_stack
+
+ mov mh_segs, #0
+.seg_loops:
+ add segs_ptr,input_data,mh_segs
+ mov offs, #64
+ add src, sha1_digest, mh_segs
+ ld1 {VA.4S}, [src], offs
+ ld1 {VB.4S}, [src], offs
+ ld1 {VC.4S}, [src], offs
+ ld1 {VD.4S}, [src], offs
+ ld1 {VE.4S}, [src], offs
+ mov block_ctr,num_blocks
+
+.block_loop:
+ sha1_single
+ subs block_ctr, block_ctr, 1
+ bne .block_loop
+
+ mov offs, #64
+ add dst, sha1_digest, mh_segs
+ st1 {VA.4S}, [dst], offs
+ st1 {VB.4S}, [dst], offs
+ st1 {VC.4S}, [dst], offs
+ st1 {VD.4S}, [dst], offs
+ st1 {VE.4S}, [dst], offs
+
+ add mh_segs, mh_segs, #16
+ cmp mh_segs, #64
+ bne .seg_loops
+
+ sha1_asimd_restore_stack
+.return:
+ ret
+
+ .size mh_sha1_block_asimd, .-mh_sha1_block_asimd
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+KEY_0:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+KEY_1:
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+KEY_2:
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+KEY_3:
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_ce.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_ce.S
new file mode 100644
index 000000000..12d3c5df2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_block_ce.S
@@ -0,0 +1,384 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg lane0_msg_0, 0
+ declare_var_vector_reg lane1_msg_0, 1
+ declare_var_vector_reg lane2_msg_0, 2
+ declare_var_vector_reg lane3_msg_0, 3
+ declare_var_vector_reg lane0_msg_1, 4
+ declare_var_vector_reg lane1_msg_1, 5
+ declare_var_vector_reg lane2_msg_1, 6
+ declare_var_vector_reg lane3_msg_1, 7
+ declare_var_vector_reg lane0_msg_2, 8
+ declare_var_vector_reg lane1_msg_2, 9
+ declare_var_vector_reg lane2_msg_2,10
+ declare_var_vector_reg lane3_msg_2,11
+ declare_var_vector_reg lane0_msg_3,12
+ declare_var_vector_reg lane1_msg_3,13
+ declare_var_vector_reg lane2_msg_3,14
+ declare_var_vector_reg lane3_msg_3,15
+
+ declare_var_vector_reg lane0_abcd ,16
+ declare_var_vector_reg lane1_abcd ,17
+ declare_var_vector_reg lane2_abcd ,18
+ declare_var_vector_reg lane3_abcd ,19
+ declare_var_vector_reg lane0_tmp0 ,20
+ declare_var_vector_reg lane1_tmp0 ,21
+ declare_var_vector_reg lane2_tmp0 ,22
+ declare_var_vector_reg lane3_tmp0 ,23
+ declare_var_vector_reg lane0_tmp1 ,24
+ declare_var_vector_reg lane1_tmp1 ,25
+ declare_var_vector_reg lane2_tmp1 ,26
+ declare_var_vector_reg lane3_tmp1 ,27
+
+
+ declare_var_vector_reg e0 ,28
+ declare_var_vector_reg e1 ,29
+ declare_var_vector_reg key ,30
+ declare_var_vector_reg tmp ,31
+
+ key_adr .req x4
+ msg_adr .req x5
+ block_cnt .req x6
+ offs .req x7
+ digest_adr .req x16
+ tmp0_adr .req x17
+ tmp1_adr .req x18
+
+/**
+macros for rounds 4-67
+*/
+.macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req
+ sha1h lane0_\tmp0\()_s, lane0_\abcd\()_s
+ sha1h lane1_\tmp0\()_s, lane1_\abcd\()_s
+ sha1h lane2_\tmp0\()_s, lane2_\abcd\()_s
+ sha1h lane3_\tmp0\()_s, lane3_\abcd\()_s
+ mov \e0\()_v.S[0],lane0_\tmp0\()_v.S[0]
+ mov \e0\()_v.S[1],lane1_\tmp0\()_v.S[0]
+ mov \e0\()_v.S[2],lane2_\tmp0\()_v.S[0]
+ mov \e0\()_v.S[3],lane3_\tmp0\()_v.S[0]
+ mov lane0_\tmp0\()_v.S[0],\e1\()_v.S[0]
+ mov lane1_\tmp0\()_v.S[0],\e1\()_v.S[1]
+ mov lane2_\tmp0\()_v.S[0],\e1\()_v.S[2]
+ mov lane3_\tmp0\()_v.S[0],\e1\()_v.S[3]
+ \inst lane0_\abcd\()_q,lane0_\tmp0\()_s,lane0_\tmp1\()_v.4s
+ \inst lane1_\abcd\()_q,lane1_\tmp0\()_s,lane1_\tmp1\()_v.4s
+ \inst lane2_\abcd\()_q,lane2_\tmp0\()_s,lane2_\tmp1\()_v.4s
+ \inst lane3_\abcd\()_q,lane3_\tmp0\()_s,lane3_\tmp1\()_v.4s
+ ld1 {lane0_\tmp0\()_v.4s-lane3_\tmp0\()_v.4s},[\tmp0\()_adr]
+ add lane0_\tmp1\()_v.4s,lane0_\msg3\()_v.4s,key_v.4s
+ add lane1_\tmp1\()_v.4s,lane1_\msg3\()_v.4s,key_v.4s
+ add lane2_\tmp1\()_v.4s,lane2_\msg3\()_v.4s,key_v.4s
+ add lane3_\tmp1\()_v.4s,lane3_\msg3\()_v.4s,key_v.4s
+ st1 {lane0_\tmp1\()_v.4s-lane3_\tmp1\()_v.4s},[\tmp1\()_adr]
+ sha1su1 lane0_\msg0\()_v.4s,lane0_\msg3\()_v.4s
+ sha1su1 lane1_\msg0\()_v.4s,lane1_\msg3\()_v.4s
+ sha1su1 lane2_\msg0\()_v.4s,lane2_\msg3\()_v.4s
+ sha1su1 lane3_\msg0\()_v.4s,lane3_\msg3\()_v.4s
+ sha1su0 lane0_\msg1\()_v.4s,lane0_\msg2\()_v.4s,lane0_\msg3\()_v.4s
+ sha1su0 lane1_\msg1\()_v.4s,lane1_\msg2\()_v.4s,lane1_\msg3\()_v.4s
+ sha1su0 lane2_\msg1\()_v.4s,lane2_\msg2\()_v.4s,lane2_\msg3\()_v.4s
+ sha1su0 lane3_\msg1\()_v.4s,lane3_\msg2\()_v.4s,lane3_\msg3\()_v.4s
+
+.endm
+
+
+/*
+ void mh_sha1_block_ce(const uint8_t * input_data,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks)
+*/
+/*
+Arguments list
+*/
+ input_data .req x0
+ digests .req x1
+ frame_buffer .req x2
+ num_blocks .req w3
+
+ .global mh_sha1_block_ce
+ .type mh_sha1_block_ce, %function
+mh_sha1_block_ce:
+ //save temp vector registers
+ stp d8, d9, [sp, -128]!
+
+ stp d10, d11, [sp, 16]
+ stp d12, d13, [sp, 32]
+ stp d14, d15, [sp, 48]
+ mov tmp0_adr,frame_buffer
+ add tmp1_adr,tmp0_adr,128
+
+
+start_loop:
+ mov block_cnt,0
+ mov msg_adr,input_data
+lane_loop:
+ mov offs,64
+ adr key_adr,KEY_0
+ //load msg 0
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[3],[msg_adr],offs
+
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[3],[msg_adr],offs
+
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[3],[msg_adr],offs
+
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[3],[msg_adr],offs
+
+ add digest_adr,digests,block_cnt
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[0],[digest_adr],offs
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[1],[digest_adr],offs
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[2],[digest_adr],offs
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[3],[digest_adr],offs
+ ldr e0_q,[digest_adr]
+
+ //load key_0
+ ldr key_q,[key_adr]
+
+ rev32 lane0_msg_0_v.16b,lane0_msg_0_v.16b
+ rev32 lane1_msg_0_v.16b,lane1_msg_0_v.16b
+ rev32 lane2_msg_0_v.16b,lane2_msg_0_v.16b
+ rev32 lane3_msg_0_v.16b,lane3_msg_0_v.16b
+ rev32 lane0_msg_1_v.16b,lane0_msg_1_v.16b
+ rev32 lane1_msg_1_v.16b,lane1_msg_1_v.16b
+ rev32 lane2_msg_1_v.16b,lane2_msg_1_v.16b
+ rev32 lane3_msg_1_v.16b,lane3_msg_1_v.16b
+ rev32 lane0_msg_2_v.16b,lane0_msg_2_v.16b
+ rev32 lane1_msg_2_v.16b,lane1_msg_2_v.16b
+ rev32 lane2_msg_2_v.16b,lane2_msg_2_v.16b
+ rev32 lane3_msg_2_v.16b,lane3_msg_2_v.16b
+ rev32 lane0_msg_3_v.16b,lane0_msg_3_v.16b
+ rev32 lane1_msg_3_v.16b,lane1_msg_3_v.16b
+ rev32 lane2_msg_3_v.16b,lane2_msg_3_v.16b
+ rev32 lane3_msg_3_v.16b,lane3_msg_3_v.16b
+
+ add lane0_tmp1_v.4s,lane0_msg_1_v.4s,key_v.4s
+ add lane1_tmp1_v.4s,lane1_msg_1_v.4s,key_v.4s
+ add lane2_tmp1_v.4s,lane2_msg_1_v.4s,key_v.4s
+ add lane3_tmp1_v.4s,lane3_msg_1_v.4s,key_v.4s
+ st1 {lane0_tmp1_v.4s-lane3_tmp1_v.4s},[tmp1_adr]
+
+ add lane0_tmp0_v.4s,lane0_msg_0_v.4s,key_v.4s
+ add lane1_tmp0_v.4s,lane1_msg_0_v.4s,key_v.4s
+ add lane2_tmp0_v.4s,lane2_msg_0_v.4s,key_v.4s
+ add lane3_tmp0_v.4s,lane3_msg_0_v.4s,key_v.4s
+
+ /* rounds 0-3 */
+ sha1h lane0_tmp1_s,lane0_abcd_s
+ sha1h lane1_tmp1_s,lane1_abcd_s
+ sha1h lane2_tmp1_s,lane2_abcd_s
+ sha1h lane3_tmp1_s,lane3_abcd_s
+ mov e1_v.S[0],lane0_tmp1_v.S[0]
+ mov e1_v.S[1],lane1_tmp1_v.S[0]
+ mov e1_v.S[2],lane2_tmp1_v.S[0]
+ mov e1_v.S[3],lane3_tmp1_v.S[0]
+ mov lane0_tmp1_v.S[0],e0_v.S[0]
+ mov lane1_tmp1_v.S[0],e0_v.S[1]
+ mov lane2_tmp1_v.S[0],e0_v.S[2]
+ mov lane3_tmp1_v.S[0],e0_v.S[3]
+ sha1c lane0_abcd_q,lane0_tmp1_s,lane0_tmp0_v.4s
+ sha1c lane1_abcd_q,lane1_tmp1_s,lane1_tmp0_v.4s
+ sha1c lane2_abcd_q,lane2_tmp1_s,lane2_tmp0_v.4s
+ sha1c lane3_abcd_q,lane3_tmp1_s,lane3_tmp0_v.4s
+ ld1 {lane0_tmp1_v.4s-lane3_tmp1_v.4s},[tmp1_adr]
+ add lane0_tmp0_v.4s,lane0_msg_2_v.4s,key_v.4s
+ sha1su0 lane0_msg_0_v.4s,lane0_msg_1_v.4s,lane0_msg_2_v.4s
+ add lane1_tmp0_v.4s,lane1_msg_2_v.4s,key_v.4s
+ sha1su0 lane1_msg_0_v.4s,lane1_msg_1_v.4s,lane1_msg_2_v.4s
+ add lane2_tmp0_v.4s,lane2_msg_2_v.4s,key_v.4s
+ sha1su0 lane2_msg_0_v.4s,lane2_msg_1_v.4s,lane2_msg_2_v.4s
+ add lane3_tmp0_v.4s,lane3_msg_2_v.4s,key_v.4s
+ sha1su0 lane3_msg_0_v.4s,lane3_msg_1_v.4s,lane3_msg_2_v.4s
+ st1 {lane0_tmp0_v.4s-lane3_tmp0_v.4s},[tmp0_adr]
+
+ sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 4-7 */
+ sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+
+
+ adr key_adr,KEY_1
+ ldr key_q,[key_adr]
+ sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 /* rounds 12-15 */
+ sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 20-23 */
+ sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1
+
+ adr key_adr,KEY_2
+ ldr key_q,[key_adr]
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 36-39 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1
+ sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+
+ adr key_adr,KEY_3
+ ldr key_q,[key_adr]
+ sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 52-55 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+
+ //msg2 and msg1 are free
+ mov lane0_msg_2_v.S[0],e1_v.S[0]
+ mov lane1_msg_2_v.S[0],e1_v.S[1]
+ mov lane2_msg_2_v.S[0],e1_v.S[2]
+ mov lane3_msg_2_v.S[0],e1_v.S[3]
+
+ /* rounds 68-71 */
+ sha1h lane0_msg_1_s,lane0_abcd_s
+ sha1h lane1_msg_1_s,lane1_abcd_s
+ sha1h lane2_msg_1_s,lane2_abcd_s
+ sha1h lane3_msg_1_s,lane3_abcd_s
+ sha1p lane0_abcd_q,lane0_msg_2_s,lane0_tmp1_v.4s
+ sha1p lane1_abcd_q,lane1_msg_2_s,lane1_tmp1_v.4s
+ sha1p lane2_abcd_q,lane2_msg_2_s,lane2_tmp1_v.4s
+ sha1p lane3_abcd_q,lane3_msg_2_s,lane3_tmp1_v.4s
+ add lane0_tmp1_v.4s,lane0_msg_3_v.4s,key_v.4s
+ add lane1_tmp1_v.4s,lane1_msg_3_v.4s,key_v.4s
+ add lane2_tmp1_v.4s,lane2_msg_3_v.4s,key_v.4s
+ add lane3_tmp1_v.4s,lane3_msg_3_v.4s,key_v.4s
+ sha1su1 lane0_msg_0_v.4s,lane0_msg_3_v.4s
+ sha1su1 lane1_msg_0_v.4s,lane1_msg_3_v.4s
+ sha1su1 lane2_msg_0_v.4s,lane2_msg_3_v.4s
+ sha1su1 lane3_msg_0_v.4s,lane3_msg_3_v.4s
+
+ /* rounds 72-75 */
+ sha1h lane0_msg_2_s,lane0_abcd_s
+ sha1h lane1_msg_2_s,lane1_abcd_s
+ sha1h lane2_msg_2_s,lane2_abcd_s
+ sha1h lane3_msg_2_s,lane3_abcd_s
+ sha1p lane0_abcd_q,lane0_msg_1_s,lane0_tmp0_v.4s
+ sha1p lane1_abcd_q,lane1_msg_1_s,lane1_tmp0_v.4s
+ sha1p lane2_abcd_q,lane2_msg_1_s,lane2_tmp0_v.4s
+ sha1p lane3_abcd_q,lane3_msg_1_s,lane3_tmp0_v.4s
+
+ /* rounds 76-79 */
+ sha1h lane0_msg_1_s,lane0_abcd_s
+ sha1h lane1_msg_1_s,lane1_abcd_s
+ sha1h lane2_msg_1_s,lane2_abcd_s
+ sha1h lane3_msg_1_s,lane3_abcd_s
+ sha1p lane0_abcd_q,lane0_msg_2_s,lane0_tmp1_v.4s
+ sha1p lane1_abcd_q,lane1_msg_2_s,lane1_tmp1_v.4s
+ sha1p lane2_abcd_q,lane2_msg_2_s,lane2_tmp1_v.4s
+ sha1p lane3_abcd_q,lane3_msg_2_s,lane3_tmp1_v.4s
+ add digest_adr,digests,block_cnt
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[0],[digest_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[1],[digest_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[2],[digest_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[3],[digest_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[0],[digest_adr]
+
+ add lane0_abcd_v.4S,lane0_abcd_v.4S,lane0_msg_0_v.4S
+ add lane1_abcd_v.4S,lane1_abcd_v.4S,lane1_msg_0_v.4S
+ add lane2_abcd_v.4S,lane2_abcd_v.4S,lane2_msg_0_v.4S
+ add lane3_abcd_v.4S,lane3_abcd_v.4S,lane3_msg_0_v.4S
+
+ add lane0_msg_1_v.4S,lane0_msg_1_v.4S,lane0_msg_3_v.4S
+ add lane1_msg_1_v.4S,lane1_msg_1_v.4S,lane1_msg_3_v.4S
+ add lane2_msg_1_v.4S,lane2_msg_1_v.4S,lane2_msg_3_v.4S
+ add lane3_msg_1_v.4S,lane3_msg_1_v.4S,lane3_msg_3_v.4S
+
+ add digest_adr,digests,block_cnt
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[0],[digest_adr],offs
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[1],[digest_adr],offs
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[2],[digest_adr],offs
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[3],[digest_adr],offs
+ st4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[0],[digest_adr]
+
+ add block_cnt,block_cnt,16
+ cmp block_cnt,64
+ add msg_adr,input_data,block_cnt
+ add digest_adr,digests,block_cnt
+ bcc lane_loop
+
+ subs num_blocks,num_blocks,1
+ add input_data,input_data,1024
+ bhi start_loop
+exit_func:
+	//restore temp registers
+ ldp d10, d11, [sp, 16]
+ ldp d12, d13, [sp, 32]
+ ldp d14, d15, [sp, 48]
+ ldp d8, d9, [sp], 128
+ ret
+
+ .size mh_sha1_block_ce, .-mh_sha1_block_ce
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY_0:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+KEY_1:
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+KEY_2:
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+KEY_3:
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_ce.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_ce.c
new file mode 100644
index 000000000..c35daeab0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_ce.c
@@ -0,0 +1,53 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+void mh_sha1_block_ce(const uint8_t * input_data,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+/***************mh_sha1_update***********/
+// mh_sha1_update_ce.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_ce
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_ce
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************mh_sha1_finalize AND mh_sha1_tail***********/
+// mh_sha1_tail processes the last, incomplete block of src data
+// mh_sha1_finalize is the mh_sha1_ctx-level wrapper around mh_sha1_tail
+// mh_sha1_finalize_ce.c and mh_sha1_tail_ce.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_ce
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_ce
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_ce
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_multibinary.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_multibinary.S
new file mode 100644
index 000000000..9a6d0caea
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/mh_sha1_multibinary.S
@@ -0,0 +1,35 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface mh_sha1_update
+mbin_interface mh_sha1_finalize
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/sha1_asimd_common.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/sha1_asimd_common.S
new file mode 100644
index 000000000..c8b8dd982
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/aarch64/sha1_asimd_common.S
@@ -0,0 +1,269 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+// macro F = (D ^ (B & (C ^ D)))
+.macro FUNC_F0
+ eor VF.16b, VC.16b, VD.16b
+ and VF.16b, VB.16b, VF.16b
+ eor VF.16b, VD.16b, VF.16b
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F1
+ eor VF.16b, VB.16b, VC.16b
+ eor VF.16b, VF.16b, VD.16b
+.endm
+
+// F = ((B & C) | (B & D) | (C & D))
+.macro FUNC_F2
+ and vT0.16b, VB.16b, VC.16b
+ and vT1.16b, VB.16b, VD.16b
+ and vT2.16b, VC.16b, VD.16b
+ orr VF.16b, vT0.16b, vT1.16b
+ orr VF.16b, VF.16b, vT2.16b
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F3
+ FUNC_F1
+.endm
+
+.altmacro
+.macro load_next_word windex
+ .if \windex < 16
+ load_x4_word \windex
+ .endif
+.endm
+
+// FUNC_F0 is merged into STEP_00_15 for efficiency
+.macro SHA1_STEP_00_15_F0 windex:req
+ rev32 WORD\windex\().16b,WORD\windex\().16b
+ next_word=\windex+1
+ load_next_word %next_word
+ // e = (a leftrotate 5) + f + e + k + w[i]
+ ushr VT.4s, VA.4s, 32 - 5
+ add VE.4s, VE.4s, VK.4s
+ sli VT.4s, VA.4s, 5
+ eor VF.16b, VC.16b, VD.16b
+ add VE.4s, VE.4s, WORD\windex\().4s
+ and VF.16b, VB.16b, VF.16b
+ add VE.4s, VE.4s, VT.4s
+ eor VF.16b, VD.16b, VF.16b
+ ushr VT.4s, VB.4s, 32 - 30
+ add VE.4s, VE.4s, VF.4s
+ sli VT.4s, VB.4s, 30
+.endm
+
+.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req
+ eor vT0.16b,\reg_3\().16b,\reg_8\().16b
+ eor VT.16b,\reg_14\().16b,\reg_16\().16b
+ eor vT0.16b,vT0.16b,VT.16b
+ // e = (a leftrotate 5) + f + e + k + w[i]
+ ushr VT.4s, vT0.4s, 32 - 1
+ add VE.4s, VE.4s, VK.4s
+ ushr vT1.4s, VA.4s, 32 - 5
+ sli VT.4s, vT0.4s, 1
+ add VE.4s, VE.4s, VT.4s
+ sli vT1.4s, VA.4s, 5
+ mov \reg_16\().16b,VT.16b
+ add VE.4s, VE.4s, vT1.4s
+ ushr VT.4s, VB.4s, 32 - 30
+ \func_f
+ add VE.4s, VE.4s, VF.4s
+ sli VT.4s, VB.4s, 30
+.endm
+
+ VA .req v0
+ VB .req v1
+ VC .req v2
+ VD .req v3
+ VE .req v4
+ VT .req v5
+ VF .req v6
+ VK .req v7
+ WORD0 .req v8
+ WORD1 .req v9
+ WORD2 .req v10
+ WORD3 .req v11
+ WORD4 .req v12
+ WORD5 .req v13
+ WORD6 .req v14
+ WORD7 .req v15
+ WORD8 .req v16
+ WORD9 .req v17
+ WORD10 .req v18
+ WORD11 .req v19
+ WORD12 .req v20
+ WORD13 .req v21
+ WORD14 .req v22
+ WORD15 .req v23
+ vT0 .req v24
+ vT1 .req v25
+ vT2 .req v26
+ vAA .req v27
+ vBB .req v28
+ vCC .req v29
+ vDD .req v30
+ vEE .req v31
+ TT .req v0
+ sha1key_adr .req x15
+
+.macro SWAP_STATES
+ // shifted VB is held in VT after each step
+ .unreq TT
+ TT .req VE
+ .unreq VE
+ VE .req VD
+ .unreq VD
+ VD .req VC
+ .unreq VC
+ VC .req VT
+ .unreq VT
+ VT .req VB
+ .unreq VB
+ VB .req VA
+ .unreq VA
+ VA .req TT
+.endm
+
+.altmacro
+.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req
+ SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\()
+.endm
+
+.macro exec_step windex:req
+ .if \windex <= 15
+ SHA1_STEP_00_15_F0 windex
+ .else
+ idx14=((\windex - 14) & 15)
+ idx8=((\windex - 8) & 15)
+ idx3=((\windex - 3) & 15)
+ idx16=(\windex & 15)
+ .if \windex <= 19
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 20 && \windex <= 39
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 40 && \windex <= 59
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 60 && \windex <= 79
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .endif
+
+ SWAP_STATES
+
+ .if \windex == 79
+		// after 80 steps, the registers ABCDET have shifted from
+		// their original order of 012345 to 341520
+		// swap back for both compile- and run-time correctness
+ mov v0.16b,v3.16b
+ .unreq VA
+ VA .req v0
+
+ mov vT0.16b,v2.16b
+ mov v2.16b,v1.16b
+ mov v1.16b,v4.16b
+ .unreq VB
+ VB .req v1
+ .unreq VC
+ VC .req v2
+
+ mov v3.16b,v5.16b
+ .unreq VD
+ VD .req v3
+
+ mov v4.16b,vT0.16b
+ .unreq VE
+ VE .req v4
+
+ .unreq VT
+ VT .req v5
+ .endif
+.endm
+
+.macro exec_steps idx:req,more:vararg
+ exec_step \idx
+ .ifnb \more
+ exec_steps \more
+ .endif
+.endm
+
+.macro sha1_single
+ load_x4_word 0
+
+ mov vAA.16B, VA.16B
+ mov vBB.16B, VB.16B
+ mov vCC.16B, VC.16B
+ mov vDD.16B, VD.16B
+ mov vEE.16B, VE.16B
+
+ adr sha1key_adr, KEY_0
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
+
+ // 20 ~ 39
+ adr sha1key_adr, KEY_1
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
+
+ // 40 ~ 59
+ adr sha1key_adr, KEY_2
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
+
+ // 60 ~ 79
+ adr sha1key_adr, KEY_3
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79
+
+ add VA.4s, vAA.4s, VA.4s
+ add VB.4s, vBB.4s, VB.4s
+ add VC.4s, vCC.4s, VC.4s
+ add VD.4s, vDD.4s, VD.4s
+ add VE.4s, vEE.4s, VE.4s
+.endm
+
+.macro sha1_asimd_save_stack
+ stp d8,d9,[sp, -64]!
+ stp d10,d11,[sp, 16]
+ stp d12,d13,[sp, 32]
+ stp d14,d15,[sp, 48]
+.endm
+
+.macro sha1_asimd_restore_stack
+ ldp d10,d11,[sp, 16]
+ ldp d12,d13,[sp, 32]
+ ldp d14,d15,[sp, 48]
+ ldp d8,d9,[sp],64
+.endm
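
The FUNC_F0-FUNC_F3 macros above apply the standard SHA-1 round functions to the four 32-bit lanes of a vector register, and the SHA1_STEP_* macros interleave the e-update with the rotation of b. A minimal scalar C sketch of the same per-lane logic (helper names are illustrative, not part of the patch):

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));	/* the ushr/sli pairs above */
}

/* Scalar forms of FUNC_F0..FUNC_F2; FUNC_F3 is the same parity function as FUNC_F1. */
static inline uint32_t f0(uint32_t b, uint32_t c, uint32_t d) { return d ^ (b & (c ^ d)); }
static inline uint32_t f1(uint32_t b, uint32_t c, uint32_t d) { return b ^ c ^ d; }
static inline uint32_t f2(uint32_t b, uint32_t c, uint32_t d) { return (b & c) | (b & d) | (c & d); }

/* One step as performed by SHA1_STEP_00_15_F0 / SHA1_STEP_16_79:
 * e += rol32(a,5) + f(b,c,d) + k + w[i], then b is rotated left by 30. */
static inline void sha1_step(uint32_t a, uint32_t *b, uint32_t *e,
			     uint32_t f, uint32_t k, uint32_t w)
{
	*e += rol32(a, 5) + f + k + w;
	*b = rol32(*b, 30);
}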
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c
new file mode 100644
index 000000000..e5d8ad86d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c
@@ -0,0 +1,141 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+int mh_sha1_init(struct mh_sha1_ctx *ctx)
+{
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint32_t i;
+
+ if (ctx == NULL)
+ return MH_SHA1_CTX_ERROR_NULL;
+
+ memset(ctx, 0, sizeof(*ctx));
+
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+ for (i = 0; i < HASH_SEGS; i++) {
+ mh_sha1_segs_digests[0][i] = MH_SHA1_H0;
+ mh_sha1_segs_digests[1][i] = MH_SHA1_H1;
+ mh_sha1_segs_digests[2][i] = MH_SHA1_H2;
+ mh_sha1_segs_digests[3][i] = MH_SHA1_H3;
+ mh_sha1_segs_digests[4][i] = MH_SHA1_H4;
+ }
+
+ return MH_SHA1_CTX_ERROR_NONE;
+}
+
+#if (!defined(NOARCH)) && (defined(__i386__) || defined(__x86_64__) \
+ || defined( _M_X64) || defined(_M_IX86))
+/***************mh_sha1_update***********/
+// mh_sha1_update_sse.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_sse
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_sse
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+// mh_sha1_update_avx.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_avx
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+// mh_sha1_update_avx2.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_avx2
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx2
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************mh_sha1_finalize AND mh_sha1_tail***********/
+// mh_sha1_tail handles the last, incomplete block of source data
+// mh_sha1_finalize is an mh_sha1_ctx wrapper around mh_sha1_tail
+
+// mh_sha1_finalize_sse.c and mh_sha1_tail_sse.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_sse
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_sse
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_sse
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+// mh_sha1_finalize_avx.c and mh_sha1_tail_avx.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_avx
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+// mh_sha1_finalize_avx2.c and mh_sha1_tail_avx2.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_avx2
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx2
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx2
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************version info***********/
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+// mh_sha1_init version info
+struct slver mh_sha1_init_slver_00000271;
+struct slver mh_sha1_init_slver = { 0x0271, 0x00, 0x00 };
+
+// mh_sha1_update version info
+struct slver mh_sha1_update_sse_slver_00000274;
+struct slver mh_sha1_update_sse_slver = { 0x0274, 0x00, 0x00 };
+
+struct slver mh_sha1_update_avx_slver_02000276;
+struct slver mh_sha1_update_avx_slver = { 0x0276, 0x00, 0x02 };
+
+struct slver mh_sha1_update_avx2_slver_04000278;
+struct slver mh_sha1_update_avx2_slver = { 0x0278, 0x00, 0x04 };
+
+// mh_sha1_finalize version info
+struct slver mh_sha1_finalize_sse_slver_00000275;
+struct slver mh_sha1_finalize_sse_slver = { 0x0275, 0x00, 0x00 };
+
+struct slver mh_sha1_finalize_avx_slver_02000277;
+struct slver mh_sha1_finalize_avx_slver = { 0x0277, 0x00, 0x02 };
+
+struct slver mh_sha1_finalize_avx2_slver_04000279;
+struct slver mh_sha1_finalize_avx2_slver = { 0x0279, 0x00, 0x04 };
+
+#endif
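
The repeated define/include/undef blocks above stamp out one generic body per SIMD variant: each inclusion of mh_sha1_update_base.c / mh_sha1_finalize_base.c is compiled with its function names bound to a different block routine. A compact, self-contained illustration of the same parameterize-and-instantiate idiom, done with a macro instead of a re-included file (all names invented for the example):

#include <stdio.h>

/* Generic body, parameterized by the names the "including" site supplies,
 * in the same spirit as MH_SHA1_UPDATE_FUNCTION / MH_SHA1_BLOCK_FUNCTION. */
#define DEFINE_SUM(UPDATE_NAME, STEP_NAME)                        \
	static int STEP_NAME(int acc, int x) { return acc + x; } \
	static int UPDATE_NAME(const int *v, int n)              \
	{                                                        \
		int acc = 0;                                     \
		for (int i = 0; i < n; i++)                      \
			acc = STEP_NAME(acc, v[i]);              \
		return acc;                                      \
	}

/* Two "architecture variants" generated from the one template. */
DEFINE_SUM(sum_generic, step_generic)
DEFINE_SUM(sum_other, step_other)

int main(void)
{
	const int v[3] = { 1, 2, 3 };
	printf("%d %d\n", sum_generic(v, 3), sum_other(v, 3));	/* 6 6 */
	return 0;
}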
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c
new file mode 100644
index 000000000..1305d048f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c
@@ -0,0 +1,70 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+/***************mh_sha1_update***********/
+// mh_sha1_update_avx512.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_avx512
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx512
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************mh_sha1_finalize AND mh_sha1_tail***********/
+// mh_sha1_tail handles the last, incomplete block of source data
+// mh_sha1_finalize is an mh_sha1_ctx wrapper around mh_sha1_tail
+// mh_sha1_finalize_avx512.c and mh_sha1_tail_avx512.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_avx512
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx512
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx512
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************version info***********/
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// mh_sha1_update version info
+struct slver mh_sha1_update_avx512_slver_0600027c;
+struct slver mh_sha1_update_avx512_slver = { 0x027c, 0x00, 0x06 };
+
+// mh_sha1_finalize version info
+struct slver mh_sha1_finalize_avx512_slver_0600027d;
+struct slver mh_sha1_finalize_avx512_slver = { 0x027d, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
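
The slver objects above carry a routine's version both in the initializer and in the symbol name: reading the fields as core, ver, snum reproduces the hex suffix (0x06, 0x00, 0x027c for mh_sha1_update_avx512_slver_0600027c). A small stand-alone check of that correspondence (the packing order is inferred from the initializers above, not taken from an ISA-L API):

#include <stdint.h>
#include <stdio.h>

struct slver {
	uint16_t snum;
	uint8_t ver;
	uint8_t core;
};

int main(void)
{
	struct slver s = { 0x027c, 0x00, 0x06 };	/* mh_sha1_update_avx512 */
	uint32_t tag = ((uint32_t)s.core << 24) | ((uint32_t)s.ver << 16) | s.snum;
	printf("%08x\n", tag);				/* prints 0600027c */
	return 0;
}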
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_base_aliases.c
new file mode 100644
index 000000000..18cd8161b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_base_aliases.c
@@ -0,0 +1,40 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "mh_sha1_internal.h"
+#include <string.h>
+int mh_sha1_update(struct mh_sha1_ctx *ctx, const void *buffer, uint32_t len)
+{
+ return mh_sha1_update_base(ctx, buffer, len);
+
+}
+
+int mh_sha1_finalize(struct mh_sha1_ctx *ctx, void *mh_sha1_digest)
+{
+ return mh_sha1_finalize_base(ctx, mh_sha1_digest);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm
new file mode 100644
index 000000000..f4b5e76a0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm
@@ -0,0 +1,506 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA-1 digests in parallel using AVX
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF, %%regC,%%regD
+ vpand %%regF, %%regF,%%regB
+ vpxor %%regF, %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-(%%imm))
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-(%%imm))
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[%%data + (%%memW * 16)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 | | Ea| Eb | Ec |...| Ep |
+
+align 32
+
+;void mh_sha1_block_avx(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including segment digests (uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer, used to store the big-endian data
+; arg 3 number of 1KB blocks
+;
+mk_global mh_sha1_block_avx, function, internal
+func(mh_sha1_block_avx)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 16 Bytes needed by avx
+ and rsp, ~0x0F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 5
+ VMOVPS A, [mh_digests_p + I*64 + 16*0]
+ VMOVPS B, [mh_digests_p + I*64 + 16*1]
+ VMOVPS C, [mh_digests_p + I*64 + 16*2]
+ VMOVPS D, [mh_digests_p + I*64 + 16*3]
+
+ vmovdqa [rsp + I*64 + 16*0], A
+ vmovdqa [rsp + I*64 + 16*1], B
+ vmovdqa [rsp + I*64 + 16*2], C
+ vmovdqa [rsp + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4
+ %assign I 0
+ %rep 16
+ VMOVPS T0,[mh_in_p + I*64+0*16]
+ VMOVPS T1,[mh_in_p + I*64+1*16]
+ VMOVPS T2,[mh_in_p + I*64+2*16]
+ VMOVPS T3,[mh_in_p + I*64+3*16]
+
+ vpshufb T0, F
+ vmovdqa [mh_data_p +(I)*16 +0*256],T0
+ vpshufb T1, F
+ vmovdqa [mh_data_p +(I)*16 +1*256],T1
+ vpshufb T2, F
+ vmovdqa [mh_data_p +(I)*16 +2*256],T2
+ vpshufb T3, F
+ vmovdqa [mh_data_p +(I)*16 +3*256],T3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+	mov	pref, 1024		;avoid repeated prefetching
+ .segs_loop:
+ ;; Initialize digests
+ vmovdqa A, [rsp + 0*64 + mh_segs]
+ vmovdqa B, [rsp + 1*64 + mh_segs]
+ vmovdqa C, [rsp + 2*64 + mh_segs]
+ vmovdqa D, [rsp + 3*64 + mh_segs]
+ vmovdqa E, [rsp + 4*64 + mh_segs]
+
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 16]
+ vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 16]
+ %rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+;; do rounds 20...39
+ vmovdqa K, [K20_39]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 40...59
+ vmovdqa K, [K40_59]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*1]
+;; do rounds 60...79
+ vmovdqa K, [K60_79]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ vpaddd A, AA
+ vpaddd B, BB
+ vpaddd C, CC
+ vpaddd D, DD
+ vpaddd E, EE
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], A
+ vmovdqa [rsp + 1*64 + mh_segs], B
+ vmovdqa [rsp + 2*64 + mh_segs], C
+ vmovdqa [rsp + 3*64 + mh_segs], D
+ vmovdqa [rsp + 4*64 + mh_segs], E
+
+ add pref, 256
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 5
+ vmovdqa A, [rsp + I*64 + 16*0]
+ vmovdqa B, [rsp + I*64 + 16*1]
+ vmovdqa C, [rsp + I*64 + 16*2]
+ vmovdqa D, [rsp + I*64 + 16*3]
+
+ VMOVPS [mh_digests_p + I*64 + 16*0], A
+ VMOVPS [mh_digests_p + I*64 + 16*1], B
+ VMOVPS [mh_digests_p + I*64 + 16*2], C
+ VMOVPS [mh_digests_p + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
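
As the layout comments above describe, each 1KB block carries 16 interleaved SHA-1 segments and the working digests are kept as digests[5][16]; this routine advances four segments per XMM register, four groups per block. A plain scalar sketch of the per-block result, lane by lane, following the word interleaving used by the base C implementation later in this patch (word i of segment s at dword index i*HASH_SEGS + s, with the words already byte-swapped to big-endian order as the frame-buffer shuffle does; the function name is illustrative):

#include <stdint.h>

#define HASH_SEGS 16	/* independent SHA-1 lanes per 1KB block */

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

void mh_sha1_block_scalar_sketch(const uint32_t block_be[16 * HASH_SEGS],
				 uint32_t digests[5][HASH_SEGS])
{
	for (int s = 0; s < HASH_SEGS; s++) {
		uint32_t w[80];
		for (int i = 0; i < 16; i++)
			w[i] = block_be[i * HASH_SEGS + s];
		for (int i = 16; i < 80; i++)
			w[i] = rol32(w[i - 3] ^ w[i - 8] ^ w[i - 14] ^ w[i - 16], 1);

		uint32_t a = digests[0][s], b = digests[1][s], c = digests[2][s];
		uint32_t d = digests[3][s], e = digests[4][s];

		for (int i = 0; i < 80; i++) {
			uint32_t f, k;
			if (i < 20)      { f = d ^ (b & (c ^ d));           k = 0x5A827999; }
			else if (i < 40) { f = b ^ c ^ d;                   k = 0x6ED9EBA1; }
			else if (i < 60) { f = (b & c) | (b & d) | (c & d); k = 0x8F1BBCDC; }
			else             { f = b ^ c ^ d;                   k = 0xCA62C1D6; }
			uint32_t t = rol32(a, 5) + f + e + k + w[i];
			e = d; d = c; c = rol32(b, 30); b = a; a = t;
		}

		digests[0][s] += a; digests[1][s] += b; digests[2][s] += c;
		digests[3][s] += d; digests[4][s] += e;
	}
}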
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm
new file mode 100644
index 000000000..fed35d83e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm
@@ -0,0 +1,508 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA-1 digests in parallel using AVX2
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; Magic functions defined in FIPS 180-1
+;;
+;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | ((~ B) & D) )
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpand %%regF, %%regB,%%regC
+ vpandn %%regT, %%regB,%%regD
+ vpor %%regF, %%regT,%%regF
+%endmacro
+
+;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+
+
+;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-%%imm)
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[%%data + (%%memW * 32)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define A ymm0
+%define B ymm1
+%define C ymm2
+%define D ymm3
+%define E ymm4
+
+%define F ymm5
+%define T0 ymm6
+%define T1 ymm7
+%define T2 ymm8
+%define T3 ymm9
+%define T4 ymm10
+%define T5 ymm11
+%define T6 ymm12
+%define T7 ymm13
+%define T8 ymm14
+%define T9 ymm15
+
+%define AA ymm5
+%define BB ymm6
+%define CC ymm7
+%define DD ymm8
+%define EE ymm9
+%define TMP ymm10
+%define FUN ymm11
+%define K ymm12
+%define W14 ymm13
+%define W15 ymm14
+%define W16 ymm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 | | Ea| Eb | Ec |...| Ep |
+
+align 32
+
+;void mh_sha1_block_avx2(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including segment digests (uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer, used to store the big-endian data
+; arg 3 number of 1KB blocks
+;
+mk_global mh_sha1_block_avx2, function, internal
+func(mh_sha1_block_avx2)
+ endbranch
+ FUNC_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ test loops, loops
+ jz .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 32 Bytes needed by avx2
+ and rsp, ~0x1F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 2
+ VMOVPS A, [mh_digests_p + I*32*5 + 32*0]
+ VMOVPS B, [mh_digests_p + I*32*5 + 32*1]
+ VMOVPS C, [mh_digests_p + I*32*5 + 32*2]
+ VMOVPS D, [mh_digests_p + I*32*5 + 32*3]
+ VMOVPS E, [mh_digests_p + I*32*5 + 32*4]
+
+ vmovdqa [rsp + I*32*5 + 32*0], A
+ vmovdqa [rsp + I*32*5 + 32*1], B
+ vmovdqa [rsp + I*32*5 + 32*2], C
+ vmovdqa [rsp + I*32*5 + 32*3], D
+ vmovdqa [rsp + I*32*5 + 32*4], E
+ %assign I (I+1)
+ %endrep
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ vbroadcasti128 F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*8_SEGS*5*2
+%assign I 0
+%rep 16
+ VMOVPS T0,[mh_in_p + I*64+0*32]
+ VMOVPS T1,[mh_in_p + I*64+1*32]
+
+ vpshufb T0, T0, F
+ vmovdqa [mh_data_p +I*32+0*512],T0
+ vpshufb T1, T1, F
+ vmovdqa [mh_data_p +I*32+1*512],T1
+%assign I (I+1)
+%endrep
+
+ xor mh_segs, mh_segs ;start from the first 8 segments
+	mov	pref, 1024		;avoid repeated prefetching
+ .segs_loop:
+ ;; Initialize digests
+ vmovdqa A, [rsp + 0*64 + mh_segs]
+ vmovdqa B, [rsp + 1*64 + mh_segs]
+ vmovdqa C, [rsp + 2*64 + mh_segs]
+ vmovdqa D, [rsp + 3*64 + mh_segs]
+ vmovdqa E, [rsp + 4*64 + mh_segs]
+
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ vpbroadcastq K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 32]
+ vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 32]
+ %rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+ PREFETCH_X [mh_in_p + pref+128*1]
+;; do rounds 20...39
+ vpbroadcastq K, [K20_39]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+;; do rounds 40...59
+ vpbroadcastq K, [K40_59]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*2]
+ PREFETCH_X [mh_in_p + pref+128*3]
+;; do rounds 60...79
+ vpbroadcastq K, [K60_79]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ vpaddd A,A, AA
+ vpaddd B,B, BB
+ vpaddd C,C, CC
+ vpaddd D,D, DD
+ vpaddd E,E, EE
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], A
+ vmovdqa [rsp + 1*64 + mh_segs], B
+ vmovdqa [rsp + 2*64 + mh_segs], C
+ vmovdqa [rsp + 3*64 + mh_segs], D
+ vmovdqa [rsp + 4*64 + mh_segs], E
+
+ add pref, 512
+
+ add mh_data_p, 512
+ add mh_segs, 32
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 2
+ vmovdqa A, [rsp + I*32*5 + 32*0]
+ vmovdqa B, [rsp + I*32*5 + 32*1]
+ vmovdqa C, [rsp + I*32*5 + 32*2]
+ vmovdqa D, [rsp + I*32*5 + 32*3]
+ vmovdqa E, [rsp + I*32*5 + 32*4]
+
+ VMOVPS [mh_digests_p + I*32*5 + 32*0], A
+ VMOVPS [mh_digests_p + I*32*5 + 32*1], B
+ VMOVPS [mh_digests_p + I*32*5 + 32*2], C
+ VMOVPS [mh_digests_p + I*32*5 + 32*3], D
+ VMOVPS [mh_digests_p + I*32*5 + 32*4], E
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .rodata align=32
+
+align 32
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: dq 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6
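
PSHUFFLE_BYTE_FLIP_MASK above selects bytes 3,2,1,0 within every dword, so vpshufb with this mask is simply a per-dword byte swap that turns the little-endian input words into the big-endian form SHA-1 operates on. The scalar equivalent is a plain 32-bit byte swap (sketch; the helper name is illustrative):

#include <stdint.h>
#include <stdio.h>

static inline uint32_t to_be32_sketch(uint32_t x)
{
	return (x >> 24) | ((x >> 8) & 0x0000FF00u) |
	       ((x << 8) & 0x00FF0000u) | (x << 24);
}

int main(void)
{
	printf("%08x\n", to_be32_sketch(0x00010203u));	/* prints 03020100 */
	return 0;
}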
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm
new file mode 100644
index 000000000..a72c21661
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm
@@ -0,0 +1,406 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA-1 digests in parallel using AVX-512
+;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovdqu64
+;SIMD variables definition
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define HH0 zmm5
+%define HH1 zmm6
+%define HH2 zmm7
+%define HH3 zmm8
+%define HH4 zmm9
+%define KT zmm10
+%define XTMP0 zmm11
+%define XTMP1 zmm12
+%define SHUF_MASK zmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;using 16 extra ZMM registers to hold the byte-reversed (big-endian) input data
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;macros definition
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%F_IMMED %2
+
+ ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
+ ; E=D, D=C, C=ROTL_30(B), B=A, A=T
+
+ ; Ft
+ ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
+ ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
+ ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
+
+ vmovdqa32 XTMP1, B ; Copy B
+ vpaddd E, E, %%WT ; E = E + Wt
+ vpternlogd XTMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D)
+ vpaddd E, E, KT ; E = E + Wt + Kt
+ vprold XTMP0, A, 5 ; TMP0 = ROTL_5(A)
+ vpaddd E, E, XTMP1 ; E = Ft(B,C,D) + E + Kt + Wt
+ vprold B, B, 30 ; B = ROTL_30(B)
+ vpaddd E, E, XTMP0 ; E = T
+
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_79 4
+%define %%WT %1
+%define %%WTp2 %2
+%define %%WTp8 %3
+%define %%WTp13 %4
+ ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16)
+ ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt)
+ vpternlogd %%WT, %%WTp2, %%WTp8, 0x96
+ vpxord %%WT, %%WT, %%WTp13
+ vprold %%WT, %%WT, 1
+%endmacro
+
+%define APPEND(a,b) a %+ b
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ ; remove unwind info macros
+ %define func(x) x: endbranch
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp + 0*16], xmm6
+ movdqa [rsp + 1*16], xmm7
+ movdqa [rsp + 2*16], xmm8
+ movdqa [rsp + 3*16], xmm9
+ movdqa [rsp + 4*16], xmm10
+ movdqa [rsp + 5*16], xmm11
+ movdqa [rsp + 6*16], xmm12
+ movdqa [rsp + 7*16], xmm13
+ movdqa [rsp + 8*16], xmm14
+ movdqa [rsp + 9*16], xmm15
+ mov [rsp + 10*16 + 0*8], r12
+ mov [rsp + 10*16 + 1*8], r13
+ mov [rsp + 10*16 + 2*8], r14
+ mov [rsp + 10*16 + 3*8], r15
+ mov [rsp + 10*16 + 4*8], rdi
+ mov [rsp + 10*16 + 5*8], rsi
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 | | Ea| Eb | Ec |...| Ep |
+
+[bits 64]
+section .text
+align 32
+
+;void mh_sha1_block_avx512(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including segment digests (uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer, used to store the big-endian data
+; arg 3 number of 1KB blocks
+;
+global mh_sha1_block_avx512
+func(mh_sha1_block_avx512)
+ endbranch
+ FUNC_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; align rsp to 64 Bytes needed by avx512
+ and rsp, ~0x3f
+
+ ; copy segs_digests into registers.
+ VMOVPS HH0, [mh_digests_p + 64*0]
+ VMOVPS HH1, [mh_digests_p + 64*1]
+ VMOVPS HH2, [mh_digests_p + 64*2]
+ VMOVPS HH3, [mh_digests_p + 64*3]
+ VMOVPS HH4, [mh_digests_p + 64*4]
+ ;a mask used to transform to big-endian data
+ vmovdqa64 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ ;using extra 16 ZMM registers instead of stack
+%assign I 0
+%rep 8
+%assign J (I+1)
+ VMOVPS APPEND(W,I),[mh_in_p + I*64+0*64]
+ VMOVPS APPEND(W,J),[mh_in_p + I*64+1*64]
+
+ vpshufb APPEND(W,I), APPEND(W,I), SHUF_MASK
+ vpshufb APPEND(W,J), APPEND(W,J), SHUF_MASK
+%assign I (I+2)
+%endrep
+
+ vmovdqa64 A, HH0
+ vmovdqa64 B, HH1
+ vmovdqa64 C, HH2
+ vmovdqa64 D, HH3
+ vmovdqa64 E, HH4
+
+ vmovdqa32 KT, [K00_19]
+%assign I 0xCA
+%assign J 0
+%assign K 2
+%assign L 8
+%assign M 13
+%assign N 0
+%rep 80
+ PROCESS_LOOP APPEND(W,J), I
+ %if N < 64
+ MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ %endif
+ %if N = 19
+ vmovdqa32 KT, [K20_39]
+ %assign I 0x96
+ %elif N = 39
+ vmovdqa32 KT, [K40_59]
+ %assign I 0xE8
+ %elif N = 59
+ vmovdqa32 KT, [K60_79]
+ %assign I 0x96
+ %endif
+ %if N % 10 = 9
+ PREFETCH_X [mh_in_p + 1024+128*(N / 10)]
+ %endif
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%assign N (N+1)
+%endrep
+
+ ; Add old digest
+ vpaddd HH0,A, HH0
+ vpaddd HH1,B, HH1
+ vpaddd HH2,C, HH2
+ vpaddd HH3,D, HH3
+ vpaddd HH4,E, HH4
+
+ add mh_in_p, 1024
+ sub loops, 1
+ jne .block_loop
+
+ ; copy segs_digests to mh_digests_p
+ VMOVPS [mh_digests_p + 64*0], HH0
+ VMOVPS [mh_digests_p + 64*1], HH1
+ VMOVPS [mh_digests_p + 64*2], HH2
+ VMOVPS [mh_digests_p + 64*3], HH3
+ VMOVPS [mh_digests_p + 64*4], HH4
+
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+
+section .data align=64
+
+align 64
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+
+K20_39: dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+
+K40_59: dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+
+K60_79: dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_mh_sha1_block_avx512
+no_mh_sha1_block_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
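
The AVX-512 path folds each MAGIC_F*-style sequence into a single vpternlogd: the 8-bit immediate is a truth table indexed by the three input bits (the B copy in the destination, then C, then D). The stand-alone check below confirms that the immediates PROCESS_LOOP uses, 0xCA, 0x96 and 0xE8, are exactly Ch, Parity and Maj (verification code only, not part of the library):

#include <stdint.h>
#include <stdio.h>

/* Bit ((b<<2)|(c<<1)|d) of the immediate is the result for inputs (b,c,d). */
static int ternlog(uint8_t imm, int b, int c, int d)
{
	return (imm >> ((b << 2) | (c << 1) | d)) & 1;
}

int main(void)
{
	for (int b = 0; b <= 1; b++)
		for (int c = 0; c <= 1; c++)
			for (int d = 0; d <= 1; d++) {
				int ch     = (b & c) | (!b & d);		/* rounds 0-19   */
				int parity = b ^ c ^ d;				/* 20-39, 60-79  */
				int maj    = (b & c) | (b & d) | (c & d);	/* 40-59         */
				if (ternlog(0xCA, b, c, d) != ch ||
				    ternlog(0x96, b, c, d) != parity ||
				    ternlog(0xE8, b, c, d) != maj) {
					printf("mismatch\n");
					return 1;
				}
			}
	printf("0xCA=Ch, 0x96=Parity, 0xE8=Maj: ok\n");
	return 0;
}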
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c
new file mode 100644
index 000000000..402c9741a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c
@@ -0,0 +1,387 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha1_internal.h"
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Base multi-hash SHA1 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+#define store_w(s, i, w, ww) (w[i][s] = to_be32(ww[i*HASH_SEGS+s])) // only used for step 0 ~ 15
+#define update_w(s, i, w) (w[i&15][s] = rol32(w[(i-3)&15][s]^w[(i-8)&15][s]^w[(i-14)&15][s]^w[(i-16)&15][s], 1)) // used for step > 15
+#define update_e_1(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F1(b[s],c[s],d[s]) + K_00_19 + w[i&15][s])
+#define update_e_2(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F2(b[s],c[s],d[s]) + K_20_39 + w[i&15][s])
+#define update_e_3(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F3(b[s],c[s],d[s]) + K_40_59 + w[i&15][s])
+#define update_e_4(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F4(b[s],c[s],d[s]) + K_60_79 + w[i&15][s])
+#define update_b(s, b) (b[s] = rol32(b[s],30))
+
+#define STORE_W(i, w, ww) \
+ store_w(0, i, w, ww); \
+ store_w(1, i, w, ww); \
+ store_w(2, i, w, ww); \
+ store_w(3, i, w, ww); \
+ store_w(4, i, w, ww); \
+ store_w(5, i, w, ww); \
+ store_w(6, i, w, ww); \
+ store_w(7, i, w, ww); \
+ store_w(8, i, w, ww); \
+ store_w(9, i, w, ww); \
+ store_w(10, i, w, ww); \
+ store_w(11, i, w, ww); \
+ store_w(12, i, w, ww); \
+ store_w(13, i, w, ww); \
+ store_w(14, i, w, ww); \
+ store_w(15, i, w, ww)
+
+#define UPDATE_W(i, w) \
+ update_w(0, i, w); \
+ update_w(1, i, w); \
+ update_w(2, i, w); \
+ update_w(3, i, w); \
+ update_w(4, i, w); \
+ update_w(5, i, w); \
+ update_w(6, i, w); \
+ update_w(7, i, w); \
+ update_w(8, i, w); \
+ update_w(9, i, w); \
+ update_w(10, i, w); \
+ update_w(11, i, w); \
+ update_w(12, i, w); \
+ update_w(13, i, w); \
+ update_w(14, i, w); \
+ update_w(15, i, w)
+
+#define UPDATE_E1(a, b, c, d, e, i, w) \
+ update_e_1(0, a, b, c, d, e, i, w); \
+ update_e_1(1, a, b, c, d, e, i, w); \
+ update_e_1(2, a, b, c, d, e, i, w); \
+ update_e_1(3, a, b, c, d, e, i, w); \
+ update_e_1(4, a, b, c, d, e, i, w); \
+ update_e_1(5, a, b, c, d, e, i, w); \
+ update_e_1(6, a, b, c, d, e, i, w); \
+ update_e_1(7, a, b, c, d, e, i, w); \
+ update_e_1(8, a, b, c, d, e, i, w); \
+ update_e_1(9, a, b, c, d, e, i, w); \
+ update_e_1(10, a, b, c, d, e, i, w); \
+ update_e_1(11, a, b, c, d, e, i, w); \
+ update_e_1(12, a, b, c, d, e, i, w); \
+ update_e_1(13, a, b, c, d, e, i, w); \
+ update_e_1(14, a, b, c, d, e, i, w); \
+ update_e_1(15, a, b, c, d, e, i, w)
+
+#define UPDATE_E2(a, b, c, d, e, i, w) \
+ update_e_2(0, a, b, c, d, e, i, w); \
+ update_e_2(1, a, b, c, d, e, i, w); \
+ update_e_2(2, a, b, c, d, e, i, w); \
+ update_e_2(3, a, b, c, d, e, i, w); \
+ update_e_2(4, a, b, c, d, e, i, w); \
+ update_e_2(5, a, b, c, d, e, i, w); \
+ update_e_2(6, a, b, c, d, e, i, w); \
+ update_e_2(7, a, b, c, d, e, i, w); \
+ update_e_2(8, a, b, c, d, e, i, w); \
+ update_e_2(9, a, b, c, d, e, i, w); \
+ update_e_2(10, a, b, c, d, e, i, w); \
+ update_e_2(11, a, b, c, d, e, i, w); \
+ update_e_2(12, a, b, c, d, e, i, w); \
+ update_e_2(13, a, b, c, d, e, i, w); \
+ update_e_2(14, a, b, c, d, e, i, w); \
+ update_e_2(15, a, b, c, d, e, i, w)
+
+#define UPDATE_E3(a, b, c, d, e, i, w) \
+ update_e_3(0, a, b, c, d, e, i, w); \
+ update_e_3(1, a, b, c, d, e, i, w); \
+ update_e_3(2, a, b, c, d, e, i, w); \
+ update_e_3(3, a, b, c, d, e, i, w); \
+ update_e_3(4, a, b, c, d, e, i, w); \
+ update_e_3(5, a, b, c, d, e, i, w); \
+ update_e_3(6, a, b, c, d, e, i, w); \
+ update_e_3(7, a, b, c, d, e, i, w); \
+ update_e_3(8, a, b, c, d, e, i, w); \
+ update_e_3(9, a, b, c, d, e, i, w); \
+ update_e_3(10, a, b, c, d, e, i, w); \
+ update_e_3(11, a, b, c, d, e, i, w); \
+ update_e_3(12, a, b, c, d, e, i, w); \
+ update_e_3(13, a, b, c, d, e, i, w); \
+ update_e_3(14, a, b, c, d, e, i, w); \
+ update_e_3(15, a, b, c, d, e, i, w)
+
+#define UPDATE_E4(a, b, c, d, e, i, w) \
+ update_e_4(0, a, b, c, d, e, i, w); \
+ update_e_4(1, a, b, c, d, e, i, w); \
+ update_e_4(2, a, b, c, d, e, i, w); \
+ update_e_4(3, a, b, c, d, e, i, w); \
+ update_e_4(4, a, b, c, d, e, i, w); \
+ update_e_4(5, a, b, c, d, e, i, w); \
+ update_e_4(6, a, b, c, d, e, i, w); \
+ update_e_4(7, a, b, c, d, e, i, w); \
+ update_e_4(8, a, b, c, d, e, i, w); \
+ update_e_4(9, a, b, c, d, e, i, w); \
+ update_e_4(10, a, b, c, d, e, i, w); \
+ update_e_4(11, a, b, c, d, e, i, w); \
+ update_e_4(12, a, b, c, d, e, i, w); \
+ update_e_4(13, a, b, c, d, e, i, w); \
+ update_e_4(14, a, b, c, d, e, i, w); \
+ update_e_4(15, a, b, c, d, e, i, w)
+
+#define UPDATE_B(b) \
+ update_b(0, b); \
+ update_b(1, b); \
+ update_b(2, b); \
+ update_b(3, b); \
+ update_b(4, b); \
+ update_b(5, b); \
+ update_b(6, b); \
+ update_b(7, b); \
+ update_b(8, b); \
+ update_b(9, b); \
+ update_b(10, b); \
+ update_b(11, b); \
+ update_b(12, b); \
+ update_b(13, b); \
+ update_b(14, b); \
+ update_b(15, b)
+
+static inline void step00_15(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS],
+ uint32_t * ww)
+{
+ STORE_W(i, w, ww);
+ UPDATE_E1(a, b, c, d, e, i, w);
+ UPDATE_B(b);
+}
+
+static inline void step16_19(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS])
+{
+ UPDATE_W(i, w);
+ UPDATE_E1(a, b, c, d, e, i, w);
+ UPDATE_B(b);
+
+}
+
+static inline void step20_39(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS])
+{
+ UPDATE_W(i, w);
+ UPDATE_E2(a, b, c, d, e, i, w);
+ UPDATE_B(b);
+}
+
+static inline void step40_59(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS])
+{
+ UPDATE_W(i, w);
+ UPDATE_E3(a, b, c, d, e, i, w);
+ UPDATE_B(b);
+}
+
+static inline void step60_79(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS])
+{
+ UPDATE_W(i, w);
+ UPDATE_E4(a, b, c, d, e, i, w);
+ UPDATE_B(b);
+}
+
+static inline void init_abcde(uint32_t * xx, uint32_t n,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS])
+{
+ xx[0] = digests[n][0];
+ xx[1] = digests[n][1];
+ xx[2] = digests[n][2];
+ xx[3] = digests[n][3];
+ xx[4] = digests[n][4];
+ xx[5] = digests[n][5];
+ xx[6] = digests[n][6];
+ xx[7] = digests[n][7];
+ xx[8] = digests[n][8];
+ xx[9] = digests[n][9];
+ xx[10] = digests[n][10];
+ xx[11] = digests[n][11];
+ xx[12] = digests[n][12];
+ xx[13] = digests[n][13];
+ xx[14] = digests[n][14];
+ xx[15] = digests[n][15];
+}
+
+static inline void add_abcde(uint32_t * xx, uint32_t n,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS])
+{
+ digests[n][0] += xx[0];
+ digests[n][1] += xx[1];
+ digests[n][2] += xx[2];
+ digests[n][3] += xx[3];
+ digests[n][4] += xx[4];
+ digests[n][5] += xx[5];
+ digests[n][6] += xx[6];
+ digests[n][7] += xx[7];
+ digests[n][8] += xx[8];
+ digests[n][9] += xx[9];
+ digests[n][10] += xx[10];
+ digests[n][11] += xx[11];
+ digests[n][12] += xx[12];
+ digests[n][13] += xx[13];
+ digests[n][14] += xx[14];
+ digests[n][15] += xx[15];
+}
+
+/*
+ * API to perform 0-79 steps of the multi-hash algorithm for
+ * a single block of data. The caller is responsible for supplying
+ * a full block of input data.
+ *
+ * Arguments:
+ *   input   - pointer to the input data
+ *   digests - the space that holds the digests of all segments
+ *
+ * Return:
+ * N/A
+ */
+void mh_sha1_single(const uint8_t * input, uint32_t(*digests)[HASH_SEGS],
+ uint8_t * frame_buffer)
+{
+ uint32_t aa[HASH_SEGS], bb[HASH_SEGS], cc[HASH_SEGS], dd[HASH_SEGS], ee[HASH_SEGS];
+ uint32_t *ww = (uint32_t *) input;
+ uint32_t(*w)[HASH_SEGS];
+
+ w = (uint32_t(*)[HASH_SEGS]) frame_buffer;
+
+ init_abcde(aa, 0, digests);
+ init_abcde(bb, 1, digests);
+ init_abcde(cc, 2, digests);
+ init_abcde(dd, 3, digests);
+ init_abcde(ee, 4, digests);
+
+ step00_15(0, aa, bb, cc, dd, ee, w, ww);
+ step00_15(1, ee, aa, bb, cc, dd, w, ww);
+ step00_15(2, dd, ee, aa, bb, cc, w, ww);
+ step00_15(3, cc, dd, ee, aa, bb, w, ww);
+ step00_15(4, bb, cc, dd, ee, aa, w, ww);
+ step00_15(5, aa, bb, cc, dd, ee, w, ww);
+ step00_15(6, ee, aa, bb, cc, dd, w, ww);
+ step00_15(7, dd, ee, aa, bb, cc, w, ww);
+ step00_15(8, cc, dd, ee, aa, bb, w, ww);
+ step00_15(9, bb, cc, dd, ee, aa, w, ww);
+ step00_15(10, aa, bb, cc, dd, ee, w, ww);
+ step00_15(11, ee, aa, bb, cc, dd, w, ww);
+ step00_15(12, dd, ee, aa, bb, cc, w, ww);
+ step00_15(13, cc, dd, ee, aa, bb, w, ww);
+ step00_15(14, bb, cc, dd, ee, aa, w, ww);
+ step00_15(15, aa, bb, cc, dd, ee, w, ww);
+
+ step16_19(16, ee, aa, bb, cc, dd, w);
+ step16_19(17, dd, ee, aa, bb, cc, w);
+ step16_19(18, cc, dd, ee, aa, bb, w);
+ step16_19(19, bb, cc, dd, ee, aa, w);
+
+ step20_39(20, aa, bb, cc, dd, ee, w);
+ step20_39(21, ee, aa, bb, cc, dd, w);
+ step20_39(22, dd, ee, aa, bb, cc, w);
+ step20_39(23, cc, dd, ee, aa, bb, w);
+ step20_39(24, bb, cc, dd, ee, aa, w);
+ step20_39(25, aa, bb, cc, dd, ee, w);
+ step20_39(26, ee, aa, bb, cc, dd, w);
+ step20_39(27, dd, ee, aa, bb, cc, w);
+ step20_39(28, cc, dd, ee, aa, bb, w);
+ step20_39(29, bb, cc, dd, ee, aa, w);
+ step20_39(30, aa, bb, cc, dd, ee, w);
+ step20_39(31, ee, aa, bb, cc, dd, w);
+ step20_39(32, dd, ee, aa, bb, cc, w);
+ step20_39(33, cc, dd, ee, aa, bb, w);
+ step20_39(34, bb, cc, dd, ee, aa, w);
+ step20_39(35, aa, bb, cc, dd, ee, w);
+ step20_39(36, ee, aa, bb, cc, dd, w);
+ step20_39(37, dd, ee, aa, bb, cc, w);
+ step20_39(38, cc, dd, ee, aa, bb, w);
+ step20_39(39, bb, cc, dd, ee, aa, w);
+
+ step40_59(40, aa, bb, cc, dd, ee, w);
+ step40_59(41, ee, aa, bb, cc, dd, w);
+ step40_59(42, dd, ee, aa, bb, cc, w);
+ step40_59(43, cc, dd, ee, aa, bb, w);
+ step40_59(44, bb, cc, dd, ee, aa, w);
+ step40_59(45, aa, bb, cc, dd, ee, w);
+ step40_59(46, ee, aa, bb, cc, dd, w);
+ step40_59(47, dd, ee, aa, bb, cc, w);
+ step40_59(48, cc, dd, ee, aa, bb, w);
+ step40_59(49, bb, cc, dd, ee, aa, w);
+ step40_59(50, aa, bb, cc, dd, ee, w);
+ step40_59(51, ee, aa, bb, cc, dd, w);
+ step40_59(52, dd, ee, aa, bb, cc, w);
+ step40_59(53, cc, dd, ee, aa, bb, w);
+ step40_59(54, bb, cc, dd, ee, aa, w);
+ step40_59(55, aa, bb, cc, dd, ee, w);
+ step40_59(56, ee, aa, bb, cc, dd, w);
+ step40_59(57, dd, ee, aa, bb, cc, w);
+ step40_59(58, cc, dd, ee, aa, bb, w);
+ step40_59(59, bb, cc, dd, ee, aa, w);
+
+ step60_79(60, aa, bb, cc, dd, ee, w);
+ step60_79(61, ee, aa, bb, cc, dd, w);
+ step60_79(62, dd, ee, aa, bb, cc, w);
+ step60_79(63, cc, dd, ee, aa, bb, w);
+ step60_79(64, bb, cc, dd, ee, aa, w);
+ step60_79(65, aa, bb, cc, dd, ee, w);
+ step60_79(66, ee, aa, bb, cc, dd, w);
+ step60_79(67, dd, ee, aa, bb, cc, w);
+ step60_79(68, cc, dd, ee, aa, bb, w);
+ step60_79(69, bb, cc, dd, ee, aa, w);
+ step60_79(70, aa, bb, cc, dd, ee, w);
+ step60_79(71, ee, aa, bb, cc, dd, w);
+ step60_79(72, dd, ee, aa, bb, cc, w);
+ step60_79(73, cc, dd, ee, aa, bb, w);
+ step60_79(74, bb, cc, dd, ee, aa, w);
+ step60_79(75, aa, bb, cc, dd, ee, w);
+ step60_79(76, ee, aa, bb, cc, dd, w);
+ step60_79(77, dd, ee, aa, bb, cc, w);
+ step60_79(78, cc, dd, ee, aa, bb, w);
+ step60_79(79, bb, cc, dd, ee, aa, w);
+
+ add_abcde(aa, 0, digests);
+ add_abcde(bb, 1, digests);
+ add_abcde(cc, 2, digests);
+ add_abcde(dd, 3, digests);
+ add_abcde(ee, 4, digests);
+}
+
+void mh_sha1_block_base(const uint8_t * input_data,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks)
+{
+ uint32_t i;
+
+ for (i = 0; i < num_blocks; i++) {
+ mh_sha1_single(input_data, digests, frame_buffer);
+ input_data += MH_SHA1_BLOCK_SIZE;
+ }
+
+ return;
+}
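
The block functions above run sixteen independent SHA1 states in lockstep, one array
index per segment. The following C sketch shows the lane-parallel round step that the
store_w/update_e_1/update_b macros expand to; it is illustrative only, not part of the
patch, assumes HASH_SEGS == 16, and uses sketch_-prefixed names instead of isa-l symbols.

    #include <stdint.h>

    #define SKETCH_HASH_SEGS 16
    #define SKETCH_K_00_19   0x5a827999UL

    static inline uint32_t sketch_rol32(uint32_t x, int r)
    {
            return (x << r) | (x >> (32 - r));
    }

    /* F1 for rounds 0..19: d ^ (b & (c ^ d)) */
    static inline uint32_t sketch_f1(uint32_t b, uint32_t c, uint32_t d)
    {
            return d ^ (b & (c ^ d));
    }

    /* One round-0..19 step applied to all 16 segments: each index s is an
     * independent SHA1 lane, which is what update_e_1() and update_b() do
     * one lane at a time in the macros above. */
    static void sketch_step_all_lanes(uint32_t a[SKETCH_HASH_SEGS],
                                      uint32_t b[SKETCH_HASH_SEGS],
                                      uint32_t c[SKETCH_HASH_SEGS],
                                      uint32_t d[SKETCH_HASH_SEGS],
                                      uint32_t e[SKETCH_HASH_SEGS],
                                      const uint32_t w[SKETCH_HASH_SEGS])
    {
            for (int s = 0; s < SKETCH_HASH_SEGS; s++) {
                    e[s] += sketch_rol32(a[s], 5) + sketch_f1(b[s], c[s], d[s])
                            + SKETCH_K_00_19 + w[s];
                    b[s] = sketch_rol32(b[s], 30);
            }
    }
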
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm
new file mode 100644
index 000000000..3d75d1649
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm
@@ -0,0 +1,498 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute the 16-segment multi-hash SHA1 using SSE
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regC
+ pxor %%regF,%%regD
+ pand %%regF,%%regB
+ pxor %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regD
+ pxor %%regF,%%regC
+ pxor %%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regB
+ movdqa %%regT,%%regB
+ por %%regF,%%regC
+ pand %%regT,%%regC
+ pand %%regF,%%regD
+ por %%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ pslld %%reg, %%imm
+ psrld %%tmp, (32-%%imm)
+ por %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ paddd %%regE,[%%data + (%%memW * 16)]
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define MOVPS movups
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 | | Ea| Eb | Ec |...| Ep |
+
+align 32
+
+;void mh_sha1_block_sse(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer, which is used to hold the big-endian data.
+; arg 3 number of 1KB blocks
+;
+mk_global mh_sha1_block_sse, function, internal
+func(mh_sha1_block_sse)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 16 Bytes needed by sse
+ and rsp, ~0x0F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 5
+ MOVPS A, [mh_digests_p + I*64 + 16*0]
+ MOVPS B, [mh_digests_p + I*64 + 16*1]
+ MOVPS C, [mh_digests_p + I*64 + 16*2]
+ MOVPS D, [mh_digests_p + I*64 + 16*3]
+
+ movdqa [rsp + I*64 + 16*0], A
+ movdqa [rsp + I*64 + 16*1], B
+ movdqa [rsp + I*64 + 16*2], C
+ movdqa [rsp + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ movdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4
+ %assign I 0
+ %rep 16
+ MOVPS T0,[mh_in_p + I*64+0*16]
+ MOVPS T1,[mh_in_p + I*64+1*16]
+ MOVPS T2,[mh_in_p + I*64+2*16]
+ MOVPS T3,[mh_in_p + I*64+3*16]
+
+ pshufb T0, F
+ movdqa [mh_data_p +(I)*16 +0*256],T0
+ pshufb T1, F
+ movdqa [mh_data_p +(I)*16 +1*256],T1
+ pshufb T2, F
+ movdqa [mh_data_p +(I)*16 +2*256],T2
+ pshufb T3, F
+ movdqa [mh_data_p +(I)*16 +3*256],T3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+	mov	pref, 1024		;avoid prefetching repeatedly
+ .segs_loop:
+ ;; Initialize digests
+ movdqa A, [rsp + 0*64 + mh_segs]
+ movdqa B, [rsp + 1*64 + mh_segs]
+ movdqa C, [rsp + 2*64 + mh_segs]
+ movdqa D, [rsp + 3*64 + mh_segs]
+ movdqa E, [rsp + 4*64 + mh_segs]
+
+ movdqa AA, A
+ movdqa BB, B
+ movdqa CC, C
+ movdqa DD, D
+ movdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ movdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 16...19
+ movdqa W16, [mh_data_p + ((16 - 16) & 15) * 16]
+ movdqa W15, [mh_data_p + ((16 - 15) & 15) * 16]
+ %rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+;; do rounds 20...39
+ movdqa K, [K20_39]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 40...59
+ movdqa K, [K40_59]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*1]
+;; do rounds 60...79
+ movdqa K, [K60_79]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ paddd A, AA
+ paddd B, BB
+ paddd C, CC
+ paddd D, DD
+ paddd E, EE
+
+ ; write out digests
+ movdqa [rsp + 0*64 + mh_segs], A
+ movdqa [rsp + 1*64 + mh_segs], B
+ movdqa [rsp + 2*64 + mh_segs], C
+ movdqa [rsp + 3*64 + mh_segs], D
+ movdqa [rsp + 4*64 + mh_segs], E
+
+ add pref, 256
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 5
+ movdqa A, [rsp + I*64 + 16*0]
+ movdqa B, [rsp + I*64 + 16*1]
+ movdqa C, [rsp + I*64 + 16*2]
+ movdqa D, [rsp + I*64 + 16*3]
+
+ MOVPS [mh_digests_p + I*64 + 16*0], A
+ MOVPS [mh_digests_p + I*64 + 16*1], B
+ MOVPS [mh_digests_p + I*64 + 16*2], C
+ MOVPS [mh_digests_p + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
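
Two of the building blocks above have simple scalar equivalents, shown here as a C
reference sketch (not part of the patch): PROLD is a per-lane 32-bit rotate-left, and
the pshufb with PSHUFFLE_BYTE_FLIP_MASK reverses the bytes of every 32-bit word so the
little-endian input becomes the big-endian words SHA1 expects.

    #include <stdint.h>

    /* PROLD reg, imm, tmp == rotate each 32-bit lane left by imm bits */
    static inline uint32_t sketch_prold(uint32_t x, int imm)
    {
            return (x << imm) | (x >> (32 - imm));
    }

    /* pshufb with 0x0405060700010203, 0x0c0d0e0f08090a0b reverses the bytes
     * of every dword, i.e. a 32-bit byte swap */
    static inline uint32_t sketch_byte_flip(uint32_t x)
    {
            return (x >> 24) | ((x >> 8) & 0x0000ff00u) |
                   ((x << 8) & 0x00ff0000u) | (x << 24);
    }
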
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c
new file mode 100644
index 000000000..3058aaa87
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c
@@ -0,0 +1,122 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * mh_sha1_finalize_base.c contains the definitions of mh_sha1_finalize_XXX
+ * and mh_sha1_tail_XXX. The default definitions are the base type, which
+ * generates mh_sha1_finalize_base and mh_sha1_tail_base. Other types are
+ * generated by mh_sha1.c through different predefined macros.
+ * mh_sha1_tail calculates the last, incomplete block of the input data.
+ * mh_sha1_finalize is the mh_sha1_ctx wrapper around mh_sha1_tail.
+ */
+#ifndef MH_SHA1_FINALIZE_FUNCTION
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_base
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_base
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_base
+#define MH_SHA1_FINALIZE_SLVER
+#endif
+
+void MH_SHA1_TAIL_FUNCTION(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS], uint8_t * frame_buffer,
+ uint32_t digests[SHA1_DIGEST_WORDS])
+{
+ uint64_t partial_buffer_len, len_in_bit;
+
+ partial_buffer_len = total_len % MH_SHA1_BLOCK_SIZE;
+
+ // Padding the first block
+ partial_buffer[partial_buffer_len] = 0x80;
+ partial_buffer_len++;
+ memset(partial_buffer + partial_buffer_len, 0,
+ MH_SHA1_BLOCK_SIZE - partial_buffer_len);
+
+	// Calculate the first block without total_length if the padding needs 2 blocks
+ if (partial_buffer_len > (MH_SHA1_BLOCK_SIZE - 8)) {
+ MH_SHA1_BLOCK_FUNCTION(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1);
+ //Padding the second block
+ memset(partial_buffer, 0, MH_SHA1_BLOCK_SIZE);
+ }
+	// Finish the padding with the total length in bits
+ len_in_bit = to_be64((uint64_t) total_len * 8);
+ *(uint64_t *) (partial_buffer + MH_SHA1_BLOCK_SIZE - 8) = len_in_bit;
+ MH_SHA1_BLOCK_FUNCTION(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1);
+
+ //Calculate multi-hash SHA1 digests (segment digests as input message)
+ sha1_for_mh_sha1((uint8_t *) mh_sha1_segs_digests, digests,
+ 4 * SHA1_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
+
+int MH_SHA1_FINALIZE_FUNCTION(struct mh_sha1_ctx *ctx, void *mh_sha1_digest)
+{
+ uint8_t *partial_block_buffer;
+ uint64_t total_len;
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+
+ if (ctx == NULL)
+ return MH_SHA1_CTX_ERROR_NULL;
+
+ total_len = ctx->total_length;
+ partial_block_buffer = ctx->partial_block_buffer;
+
+ /* mh_sha1 tail */
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+
+ MH_SHA1_TAIL_FUNCTION(partial_block_buffer, total_len, mh_sha1_segs_digests,
+ aligned_frame_buffer, ctx->mh_sha1_digest);
+
+ /* Output the digests of mh_sha1 */
+ if (mh_sha1_digest != NULL) {
+ ((uint32_t *) mh_sha1_digest)[0] = ctx->mh_sha1_digest[0];
+ ((uint32_t *) mh_sha1_digest)[1] = ctx->mh_sha1_digest[1];
+ ((uint32_t *) mh_sha1_digest)[2] = ctx->mh_sha1_digest[2];
+ ((uint32_t *) mh_sha1_digest)[3] = ctx->mh_sha1_digest[3];
+ ((uint32_t *) mh_sha1_digest)[4] = ctx->mh_sha1_digest[4];
+ }
+
+ return MH_SHA1_CTX_ERROR_NONE;
+}
+
+#ifdef MH_SHA1_FINALIZE_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver mh_sha1_finalize_base_slver_0000027b;
+struct slver mh_sha1_finalize_base_slver = { 0x027b, 0x00, 0x00 };
+#endif
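
The tail routine above emits either one or two padded blocks, depending on whether the
leftover bytes plus the 0x80 marker still leave room for the 8-byte bit-length field.
A small C sketch of that decision, assuming the 1024-byte MH_SHA1_BLOCK_SIZE used by
the block functions in this patch (sketch_ names are illustrative, not isa-l symbols):

    #include <stdint.h>

    #define SKETCH_BLOCK_SIZE 1024

    /* Mirrors the partial_buffer_len > (MH_SHA1_BLOCK_SIZE - 8) test above. */
    static uint32_t sketch_padded_blocks(uint32_t total_len)
    {
            uint32_t rem = total_len % SKETCH_BLOCK_SIZE;

            /* rem data bytes + 1 byte of 0x80 must leave 8 bytes for the length */
            return (rem + 1 > SKETCH_BLOCK_SIZE - 8) ? 2 : 1;
    }
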
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h
new file mode 100644
index 000000000..81823048e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h
@@ -0,0 +1,308 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA1_INTERNAL_H_
+#define _MH_SHA1_INTERNAL_H_
+
+/**
+ * @file mh_sha1_internal.h
+ * @brief mh_sha1 internal function prototypes and macros
+ *
+ * Interface for mh_sha1 internal functions
+ *
+ */
+#include <stdint.h>
+#include "mh_sha1.h"
+#include "endian_helper.h"
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+ // 64byte pointer align
+#define ALIGN_64(pointer) ( ((uint64_t)(pointer) + 0x3F)&(~0x3F) )
+
+ /*******************************************************************
+ * mh_sha1 constants and macros
+ ******************************************************************/
+ /* mh_sha1 constants */
+#define MH_SHA1_H0 0x67452301UL
+#define MH_SHA1_H1 0xefcdab89UL
+#define MH_SHA1_H2 0x98badcfeUL
+#define MH_SHA1_H3 0x10325476UL
+#define MH_SHA1_H4 0xc3d2e1f0UL
+
+#define K_00_19 0x5a827999UL
+#define K_20_39 0x6ed9eba1UL
+#define K_40_59 0x8f1bbcdcUL
+#define K_60_79 0xca62c1d6UL
+
+ /* mh_sha1 macros */
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (b ^ c ^ d)
+#define F3(b,c,d) ((b & c) | (d & (b | c)))
+#define F4(b,c,d) (b ^ c ^ d)
+
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+ /*******************************************************************
+ * SHA1 API internal function prototypes
+ ******************************************************************/
+
+ /**
+ * @brief Performs the complete SHA1 algorithm.
+ *
+ * @param input_data Pointer to the buffer containing the input message.
+ * @param digest Pointer to the digest to update.
+ * @param len Length of the buffer.
+ * @returns None
+ */
+ void sha1_for_mh_sha1(const uint8_t * input_data, uint32_t * digest, const uint32_t len);
+
+ /*******************************************************************
+ * mh_sha1 API internal function prototypes
+ * Multiple versions of Update and Finalize functions are supplied which use
+ * multiple versions of block and tail process subfunctions.
+ ******************************************************************/
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @returns none
+ *
+ */
+ void mh_sha1_tail(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha1_digest mh_sha1 digest
+ * @returns none
+ *
+ */
+ void mh_sha1_tail_base(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * @requires SSE
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha1_digest mh_sha1 digest
+ * @returns none
+ *
+ */
+ void mh_sha1_tail_sse(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * @requires AVX
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha1_digest mh_sha1 digest
+ * @returns none
+ *
+ */
+ void mh_sha1_tail_avx(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * @requires AVX2
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha1_digest mh_sha1 digest
+ * @returns none
+ *
+ */
+ void mh_sha1_tail_avx2(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * @requires AVX512
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha1_digest mh_sha1 digest
+ * @returns none
+ *
+ */
+ void mh_sha1_tail_avx512(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block_base(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * @requires SSE
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block_sse(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * @requires AVX
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block_avx(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * @requires AVX2
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block_avx2(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * @requires AVX512
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block_avx512(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
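
A short usage sketch of the ALIGN_64 macro defined above, which rounds a pointer value
up to the next 64-byte boundary. The over-allocated scratch buffer is only an
illustration; it is not part of the library.

    #include <stdint.h>

    static void sketch_align_example(void)
    {
            uint8_t scratch[1024 + 63];     /* slack so the aligned region still fits */
            uint8_t *aligned;

            /* same expression as ALIGN_64(pointer) in mh_sha1_internal.h */
            aligned = (uint8_t *)(((uint64_t)scratch + 0x3F) & ~(uint64_t)0x3F);

            (void)aligned;                  /* aligned now sits on a 64-byte boundary */
    }
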
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm
new file mode 100644
index 000000000..590aa6c5f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm
@@ -0,0 +1,77 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf32
+ [bits 32]
+%else
+ default rel
+ [bits 64]
+
+ extern mh_sha1_update_sse
+ extern mh_sha1_update_avx
+ extern mh_sha1_update_avx2
+ extern mh_sha1_finalize_sse
+ extern mh_sha1_finalize_avx
+ extern mh_sha1_finalize_avx2
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ extern mh_sha1_update_avx512
+ extern mh_sha1_finalize_avx512
+ %endif
+
+%endif
+
+extern mh_sha1_update_base
+extern mh_sha1_finalize_base
+
+mbin_interface mh_sha1_update
+mbin_interface mh_sha1_finalize
+
+%ifidn __OUTPUT_FORMAT__, elf64
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 mh_sha1_update, mh_sha1_update_base, mh_sha1_update_sse, mh_sha1_update_avx, mh_sha1_update_avx2, mh_sha1_update_avx512
+ mbin_dispatch_init6 mh_sha1_finalize, mh_sha1_finalize_base, mh_sha1_finalize_sse, mh_sha1_finalize_avx, mh_sha1_finalize_avx2, mh_sha1_finalize_avx512
+ %else
+ mbin_dispatch_init5 mh_sha1_update, mh_sha1_update_base, mh_sha1_update_sse, mh_sha1_update_avx, mh_sha1_update_avx2
+ mbin_dispatch_init5 mh_sha1_finalize, mh_sha1_finalize_base, mh_sha1_finalize_sse, mh_sha1_finalize_avx, mh_sha1_finalize_avx2
+ %endif
+
+%else
+ mbin_dispatch_init2 mh_sha1_update, mh_sha1_update_base
+ mbin_dispatch_init2 mh_sha1_finalize, mh_sha1_finalize_base
+%endif
+
+;;; func core, ver, snum
+slversion mh_sha1_update, 00, 02, 0272
+slversion mh_sha1_finalize, 00, 02, 0273
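
Conceptually, the mbin_interface/mbin_dispatch_init macros above resolve mh_sha1_update
and mh_sha1_finalize to the best variant available on the running CPU. The C sketch
below shows that idea with an explicit function-pointer selection; it is not the actual
mbin_* implementation, cpu_supports_sse()/cpu_supports_avx2() are hypothetical
feature-detection helpers, and the prototypes simply mirror how mh_sha1_perf.c calls
these functions.

    #include <stdint.h>

    struct mh_sha1_ctx;                     /* defined in mh_sha1.h */

    int mh_sha1_update_base(struct mh_sha1_ctx *ctx, const void *buf, uint32_t len);
    int mh_sha1_update_sse(struct mh_sha1_ctx *ctx, const void *buf, uint32_t len);
    int mh_sha1_update_avx2(struct mh_sha1_ctx *ctx, const void *buf, uint32_t len);

    int cpu_supports_sse(void);             /* hypothetical helper */
    int cpu_supports_avx2(void);            /* hypothetical helper */

    typedef int (*mh_sha1_update_fn)(struct mh_sha1_ctx *, const void *, uint32_t);

    /* Pick the most capable implementation, falling back to the base version. */
    static mh_sha1_update_fn sketch_pick_mh_sha1_update(void)
    {
            if (cpu_supports_avx2())
                    return mh_sha1_update_avx2;
            if (cpu_supports_sse())
                    return mh_sha1_update_sse;
            return mh_sha1_update_base;
    }
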
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c
new file mode 100644
index 000000000..4fd6c09a1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c
@@ -0,0 +1,180 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Loop many times over the same buffer
+# define TEST_LEN 16*1024
+# define TEST_LOOPS 20000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define TEST_LEN 32*1024*1024
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#define TEST_MEM TEST_LEN
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_CTX_ERROR_NONE){ \
+		printf("The mh_sha1 function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha1_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 20);
+ printf("ref: ");
+ dump((char *)hash_test, 20);
+ }
+
+ return mh_sha1_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int i, fail = 0;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ struct mh_sha1_ctx *update_ctx_test = NULL, *update_ctx_base = NULL;
+ struct perf start, stop;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_perf:\n");
+
+ buff = malloc(TEST_LEN);
+ update_ctx_test = malloc(sizeof(*update_ctx_test));
+ update_ctx_base = malloc(sizeof(*update_ctx_base));
+
+ if (buff == NULL || update_ctx_base == NULL || update_ctx_test == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ // mh_sha1 base version
+ mh_sha1_init(update_ctx_base);
+ mh_sha1_update_base(update_ctx_base, buff, TEST_LEN);
+ mh_sha1_finalize_base(update_ctx_base, hash_base);
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS / 10; i++) {
+ mh_sha1_init(update_ctx_base);
+ mh_sha1_update_base(update_ctx_base, buff, TEST_LEN);
+ mh_sha1_finalize_base(update_ctx_base, hash_base);
+ }
+ perf_stop(&stop);
+ printf("mh_sha1_update_base" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ //Update feature test
+ CHECK_RETURN(mh_sha1_init(update_ctx_test));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test));
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ CHECK_RETURN(mh_sha1_init(update_ctx_test));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test));
+ }
+ perf_stop(&stop);
+ printf(xstr(TEST_UPDATE_FUNCTION) TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ // Check results
+ fail = compare_digests(hash_base, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", TEST_LEN);
+ return -1;
+ }
+
+ if (fail)
+ printf("Test failed function test%d\n", fail);
+ else
+ printf("Pass func check\n");
+
+ return fail;
+}
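
The benchmark above selects which implementation it measures by token-pasting
MH_SHA1_FUNC_TYPE onto the function names; how that macro gets defined (a compiler
flag or a thin wrapper source) is build-specific and not shown in this patch. A
minimal expansion example:

    #define _FUNC_TOKEN(func, type) func##type
    #define FUNC_TOKEN(func, type)  _FUNC_TOKEN(func, type)

    #define MH_SHA1_FUNC_TYPE _sse         /* example value */
    #define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_update, MH_SHA1_FUNC_TYPE)
    /* TEST_UPDATE_FUNCTION now expands to mh_sha1_update_sse */
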
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c
new file mode 100644
index 000000000..71caba50e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c
@@ -0,0 +1,430 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+ // The macros and sub-functions below already exist in the source file
+ // sha1_for_mh_sha1.c, which is part of the ISA-L library as internal
+ // functions. They are duplicated here because of a linking issue:
+ // mh_sha1_ref() needs these macros and sub-functions without linking
+ // against the ISA-L library, so they are repeated here to keep the
+ // essential sub-functions in this object file.
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define W(x) w[(x) & 15]
+
+#define step00_19(i,a,b,c,d,e) \
+ if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ else W(i) = to_be32(ww[i]); \
+ e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \
+ b = rol32(b,30)
+
+#define step20_39(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \
+ b = rol32(b,30)
+
+#define step40_59(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \
+ b = rol32(b,30)
+
+#define step60_79(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \
+ b = rol32(b,30)
+
+static void OPT_FIX sha1_single_for_mh_sha1_ref(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e;
+ uint32_t w[16] = { 0 };
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+
+ step00_19(0, a, b, c, d, e);
+ step00_19(1, e, a, b, c, d);
+ step00_19(2, d, e, a, b, c);
+ step00_19(3, c, d, e, a, b);
+ step00_19(4, b, c, d, e, a);
+ step00_19(5, a, b, c, d, e);
+ step00_19(6, e, a, b, c, d);
+ step00_19(7, d, e, a, b, c);
+ step00_19(8, c, d, e, a, b);
+ step00_19(9, b, c, d, e, a);
+ step00_19(10, a, b, c, d, e);
+ step00_19(11, e, a, b, c, d);
+ step00_19(12, d, e, a, b, c);
+ step00_19(13, c, d, e, a, b);
+ step00_19(14, b, c, d, e, a);
+ step00_19(15, a, b, c, d, e);
+ step00_19(16, e, a, b, c, d);
+ step00_19(17, d, e, a, b, c);
+ step00_19(18, c, d, e, a, b);
+ step00_19(19, b, c, d, e, a);
+
+ step20_39(20, a, b, c, d, e);
+ step20_39(21, e, a, b, c, d);
+ step20_39(22, d, e, a, b, c);
+ step20_39(23, c, d, e, a, b);
+ step20_39(24, b, c, d, e, a);
+ step20_39(25, a, b, c, d, e);
+ step20_39(26, e, a, b, c, d);
+ step20_39(27, d, e, a, b, c);
+ step20_39(28, c, d, e, a, b);
+ step20_39(29, b, c, d, e, a);
+ step20_39(30, a, b, c, d, e);
+ step20_39(31, e, a, b, c, d);
+ step20_39(32, d, e, a, b, c);
+ step20_39(33, c, d, e, a, b);
+ step20_39(34, b, c, d, e, a);
+ step20_39(35, a, b, c, d, e);
+ step20_39(36, e, a, b, c, d);
+ step20_39(37, d, e, a, b, c);
+ step20_39(38, c, d, e, a, b);
+ step20_39(39, b, c, d, e, a);
+
+ step40_59(40, a, b, c, d, e);
+ step40_59(41, e, a, b, c, d);
+ step40_59(42, d, e, a, b, c);
+ step40_59(43, c, d, e, a, b);
+ step40_59(44, b, c, d, e, a);
+ step40_59(45, a, b, c, d, e);
+ step40_59(46, e, a, b, c, d);
+ step40_59(47, d, e, a, b, c);
+ step40_59(48, c, d, e, a, b);
+ step40_59(49, b, c, d, e, a);
+ step40_59(50, a, b, c, d, e);
+ step40_59(51, e, a, b, c, d);
+ step40_59(52, d, e, a, b, c);
+ step40_59(53, c, d, e, a, b);
+ step40_59(54, b, c, d, e, a);
+ step40_59(55, a, b, c, d, e);
+ step40_59(56, e, a, b, c, d);
+ step40_59(57, d, e, a, b, c);
+ step40_59(58, c, d, e, a, b);
+ step40_59(59, b, c, d, e, a);
+
+ step60_79(60, a, b, c, d, e);
+ step60_79(61, e, a, b, c, d);
+ step60_79(62, d, e, a, b, c);
+ step60_79(63, c, d, e, a, b);
+ step60_79(64, b, c, d, e, a);
+ step60_79(65, a, b, c, d, e);
+ step60_79(66, e, a, b, c, d);
+ step60_79(67, d, e, a, b, c);
+ step60_79(68, c, d, e, a, b);
+ step60_79(69, b, c, d, e, a);
+ step60_79(70, a, b, c, d, e);
+ step60_79(71, e, a, b, c, d);
+ step60_79(72, d, e, a, b, c);
+ step60_79(73, c, d, e, a, b);
+ step60_79(74, b, c, d, e, a);
+ step60_79(75, a, b, c, d, e);
+ step60_79(76, e, a, b, c, d);
+ step60_79(77, d, e, a, b, c);
+ step60_79(78, c, d, e, a, b);
+ step60_79(79, b, c, d, e, a);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+}
+
+void sha1_for_mh_sha1_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA1_BLOCK_SIZE];
+
+ digest[0] = MH_SHA1_H0;
+ digest[1] = MH_SHA1_H1;
+ digest[2] = MH_SHA1_H2;
+ digest[3] = MH_SHA1_H3;
+ digest[4] = MH_SHA1_H4;
+
+ i = len;
+ while (i >= SHA1_BLOCK_SIZE) {
+ sha1_single_for_mh_sha1_ref(input_data, digest);
+ input_data += SHA1_BLOCK_SIZE;
+ i -= SHA1_BLOCK_SIZE;
+ }
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - 8); j++)
+ buf[j] = 0;
+
+ if (i > SHA1_BLOCK_SIZE - 8)
+ i = 2 * SHA1_BLOCK_SIZE;
+ else
+ i = SHA1_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha1_single_for_mh_sha1_ref(buf, digest);
+ if (i == (2 * SHA1_BLOCK_SIZE))
+ sha1_single_for_mh_sha1_ref(buf + SHA1_BLOCK_SIZE, digest);
+}
+
+/*
+ * Rearrange one segment's data from one block into the new_data buffer.
+ *
+ * Layout of new_data:
+ * segment
+ * -------------------------
+ * w0 | w1 | ... | w15
+ *
+ */
+static inline void transform_input_single(uint32_t * new_data, uint32_t * input,
+ uint32_t segment)
+{
+ new_data[16 * segment + 0] = input[16 * 0 + segment];
+ new_data[16 * segment + 1] = input[16 * 1 + segment];
+ new_data[16 * segment + 2] = input[16 * 2 + segment];
+ new_data[16 * segment + 3] = input[16 * 3 + segment];
+ new_data[16 * segment + 4] = input[16 * 4 + segment];
+ new_data[16 * segment + 5] = input[16 * 5 + segment];
+ new_data[16 * segment + 6] = input[16 * 6 + segment];
+ new_data[16 * segment + 7] = input[16 * 7 + segment];
+ new_data[16 * segment + 8] = input[16 * 8 + segment];
+ new_data[16 * segment + 9] = input[16 * 9 + segment];
+ new_data[16 * segment + 10] = input[16 * 10 + segment];
+ new_data[16 * segment + 11] = input[16 * 11 + segment];
+ new_data[16 * segment + 12] = input[16 * 12 + segment];
+ new_data[16 * segment + 13] = input[16 * 13 + segment];
+ new_data[16 * segment + 14] = input[16 * 14 + segment];
+ new_data[16 * segment + 15] = input[16 * 15 + segment];
+}
+
+// Adapt parameters to sha1_single_for_mh_sha1_ref
+#define sha1_update_one_seg(data, digest) \
+ sha1_single_for_mh_sha1_ref((const uint8_t *)(data), (uint32_t *)(digest))
+
+/*
+ * Rearrange all segments' data from one block into the new_data buffer.
+ *
+ * Layout of new_data:
+ * segment
+ * -------------------------
+ * seg0: | w0 | w1 | ... | w15
+ * seg1: | w0 | w1 | ... | w15
+ * seg2: | w0 | w1 | ... | w15
+ * ....
+ * seg15: | w0 | w1 | ... | w15
+ *
+ */
+static inline void transform_input(uint32_t * new_data, uint32_t * input, uint32_t block)
+{
+ uint32_t *current_input = input + block * MH_SHA1_BLOCK_SIZE / 4;
+
+ transform_input_single(new_data, current_input, 0);
+ transform_input_single(new_data, current_input, 1);
+ transform_input_single(new_data, current_input, 2);
+ transform_input_single(new_data, current_input, 3);
+ transform_input_single(new_data, current_input, 4);
+ transform_input_single(new_data, current_input, 5);
+ transform_input_single(new_data, current_input, 6);
+ transform_input_single(new_data, current_input, 7);
+ transform_input_single(new_data, current_input, 8);
+ transform_input_single(new_data, current_input, 9);
+ transform_input_single(new_data, current_input, 10);
+ transform_input_single(new_data, current_input, 11);
+ transform_input_single(new_data, current_input, 12);
+ transform_input_single(new_data, current_input, 13);
+ transform_input_single(new_data, current_input, 14);
+ transform_input_single(new_data, current_input, 15);
+
+}
+
+/*
+ * Calculate all segments' digests from one block.
+ *
+ * Layout of seg_digest:
+ * segment
+ * -------------------------
+ * seg0: | H0 | H1 | ... | H4
+ * seg1: | H0 | H1 | ... | H4
+ * seg2: | H0 | H1 | ... | H4
+ * ....
+ * seg15: | H0 | H1 | ... | H4
+ *
+ */
+static inline void sha1_update_all_segs(uint32_t * new_data,
+ uint32_t(*mh_sha1_seg_digests)[SHA1_DIGEST_WORDS])
+{
+ sha1_update_one_seg(&(new_data)[16 * 0], mh_sha1_seg_digests[0]);
+ sha1_update_one_seg(&(new_data)[16 * 1], mh_sha1_seg_digests[1]);
+ sha1_update_one_seg(&(new_data)[16 * 2], mh_sha1_seg_digests[2]);
+ sha1_update_one_seg(&(new_data)[16 * 3], mh_sha1_seg_digests[3]);
+ sha1_update_one_seg(&(new_data)[16 * 4], mh_sha1_seg_digests[4]);
+ sha1_update_one_seg(&(new_data)[16 * 5], mh_sha1_seg_digests[5]);
+ sha1_update_one_seg(&(new_data)[16 * 6], mh_sha1_seg_digests[6]);
+ sha1_update_one_seg(&(new_data)[16 * 7], mh_sha1_seg_digests[7]);
+ sha1_update_one_seg(&(new_data)[16 * 8], mh_sha1_seg_digests[8]);
+ sha1_update_one_seg(&(new_data)[16 * 9], mh_sha1_seg_digests[9]);
+ sha1_update_one_seg(&(new_data)[16 * 10], mh_sha1_seg_digests[10]);
+ sha1_update_one_seg(&(new_data)[16 * 11], mh_sha1_seg_digests[11]);
+ sha1_update_one_seg(&(new_data)[16 * 12], mh_sha1_seg_digests[12]);
+ sha1_update_one_seg(&(new_data)[16 * 13], mh_sha1_seg_digests[13]);
+ sha1_update_one_seg(&(new_data)[16 * 14], mh_sha1_seg_digests[14]);
+ sha1_update_one_seg(&(new_data)[16 * 15], mh_sha1_seg_digests[15]);
+}
+
+void mh_sha1_block_ref(const uint8_t * input_data, uint32_t(*digests)[HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks)
+{
+ uint32_t i, j;
+ uint32_t *temp_buffer = (uint32_t *) frame_buffer;
+ uint32_t(*trans_digests)[SHA1_DIGEST_WORDS];
+
+ trans_digests = (uint32_t(*)[SHA1_DIGEST_WORDS]) digests;
+
+ // Re-structure seg_digests from 5*16 to 16*5
+ for (j = 0; j < HASH_SEGS; j++) {
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ temp_buffer[j * SHA1_DIGEST_WORDS + i] = digests[i][j];
+ }
+ }
+ memcpy(trans_digests, temp_buffer, 4 * SHA1_DIGEST_WORDS * HASH_SEGS);
+
+ // Calculate digests for all segments, leveraging sha1 API
+ for (i = 0; i < num_blocks; i++) {
+ transform_input(temp_buffer, (uint32_t *) input_data, i);
+ sha1_update_all_segs(temp_buffer, trans_digests);
+ }
+
+ // Re-structure seg_digests from 16*5 to 5*16
+ for (j = 0; j < HASH_SEGS; j++) {
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ temp_buffer[i * HASH_SEGS + j] = trans_digests[j][i];
+ }
+ }
+ memcpy(digests, temp_buffer, 4 * SHA1_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
+
+void mh_sha1_tail_ref(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS], uint8_t * frame_buffer,
+ uint32_t digests[SHA1_DIGEST_WORDS])
+{
+ uint64_t partial_buffer_len, len_in_bit;
+
+ partial_buffer_len = total_len % MH_SHA1_BLOCK_SIZE;
+
+ // Padding the first block
+ partial_buffer[partial_buffer_len] = 0x80;
+ partial_buffer_len++;
+ memset(partial_buffer + partial_buffer_len, 0,
+ MH_SHA1_BLOCK_SIZE - partial_buffer_len);
+
+ // Calculate the first block without total_length if the padding needs 2 blocks
+ if (partial_buffer_len > (MH_SHA1_BLOCK_SIZE - 8)) {
+ mh_sha1_block_ref(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1);
+ //Padding the second block
+ memset(partial_buffer, 0, MH_SHA1_BLOCK_SIZE);
+ }
+ // Pad the last block with the total length in bits
+ len_in_bit = to_be64((uint64_t) total_len * 8);
+ *(uint64_t *) (partial_buffer + MH_SHA1_BLOCK_SIZE - 8) = len_in_bit;
+ mh_sha1_block_ref(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1);
+
+ //Calculate multi-hash SHA1 digests (segment digests as input message)
+ sha1_for_mh_sha1_ref((uint8_t *) mh_sha1_segs_digests, digests,
+ 4 * SHA1_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
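+
+/*
+ * Note: as the final call above shows, the multi-hash result is an ordinary
+ * SHA-1 computed over the 4 * SHA1_DIGEST_WORDS * HASH_SEGS = 320 bytes of
+ * interim segment digests, kept in the same word-major layout
+ * (mh_sha1_segs_digests[SHA1_DIGEST_WORDS][HASH_SEGS]) used by the block
+ * functions.
+ */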
+
+void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest)
+{
+ uint64_t total_len;
+ uint64_t num_blocks;
+ uint32_t mh_sha1_segs_digests[SHA1_DIGEST_WORDS][HASH_SEGS];
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE];
+ uint8_t partial_block_buffer[MH_SHA1_BLOCK_SIZE * 2];
+ uint32_t mh_sha1_hash_dword[SHA1_DIGEST_WORDS];
+ uint32_t i;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+ /* Initialize digests of all segments */
+ for (i = 0; i < HASH_SEGS; i++) {
+ mh_sha1_segs_digests[0][i] = MH_SHA1_H0;
+ mh_sha1_segs_digests[1][i] = MH_SHA1_H1;
+ mh_sha1_segs_digests[2][i] = MH_SHA1_H2;
+ mh_sha1_segs_digests[3][i] = MH_SHA1_H3;
+ mh_sha1_segs_digests[4][i] = MH_SHA1_H4;
+ }
+
+ total_len = len;
+
+ // Calculate blocks
+ num_blocks = len / MH_SHA1_BLOCK_SIZE;
+ if (num_blocks > 0) {
+ //do num_blocks process
+ mh_sha1_block_ref(input_data, mh_sha1_segs_digests, frame_buffer, num_blocks);
+ len -= num_blocks * MH_SHA1_BLOCK_SIZE;
+ input_data += num_blocks * MH_SHA1_BLOCK_SIZE;
+ }
+ // Store the partial block
+ if (len != 0) {
+ memcpy(partial_block_buffer, input_data, len);
+ }
+
+ /* Finalize */
+ mh_sha1_tail_ref(partial_block_buffer, total_len, mh_sha1_segs_digests,
+ frame_buffer, mh_sha1_hash_dword);
+
+ // Output the digests of mh_sha1
+ if (mh_sha1_digest != NULL) {
+ mh_sha1_digest[0] = mh_sha1_hash_dword[0];
+ mh_sha1_digest[1] = mh_sha1_hash_dword[1];
+ mh_sha1_digest[2] = mh_sha1_hash_dword[2];
+ mh_sha1_digest[3] = mh_sha1_hash_dword[3];
+ mh_sha1_digest[4] = mh_sha1_hash_dword[4];
+ }
+
+ return;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c
new file mode 100644
index 000000000..792c4452b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c
@@ -0,0 +1,217 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_finalize, MH_SHA1_FUNC_TYPE)
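+
+/*
+ * Token-pasting example: with MH_SHA1_FUNC_TYPE left empty (the default
+ * above), TEST_UPDATE_FUNCTION expands to mh_sha1_update and
+ * TEST_FINAL_FUNCTION to mh_sha1_finalize; a build that defines, say,
+ * MH_SHA1_FUNC_TYPE as _base would exercise mh_sha1_update_base and
+ * mh_sha1_finalize_base instead.
+ */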
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_CTX_ERROR_NONE){ \
+ printf("The mh_sha1 function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest);
+#define MH_SHA1_REF mh_sha1_ref
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_ref[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_ref[i])
+ mh_sha1_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("ref: ");
+ dump((char *)hash_ref, 20);
+ printf("test: ");
+ dump((char *)hash_test, 20);
+ }
+
+ return mh_sha1_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_ref[SHA1_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int size, offset;
+ struct mh_sha1_ctx *update_ctx = NULL;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:\n");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ MH_SHA1_REF(buff, TEST_LEN, hash_ref);
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+ // Test various size messages
+ for (size = TEST_LEN; size >= 0; size--) {
+
+ // Fill with rand data
+ rand_buffer(buff, size);
+
+ MH_SHA1_REF(buff, size, hash_ref);
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ if ((size & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various buffer offsets and sizes
+ printf("offset tests");
+ for (size = TEST_LEN - 256; size > 256; size -= 11) {
+ for (offset = 0; offset < 256; offset++) {
+ MH_SHA1_REF(buff + offset, size, hash_ref);
+
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ }
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Run efence tests
+ printf("efence tests");
+ for (size = TEST_SIZE; size > 0; size--) {
+ offset = TEST_LEN - size;
+
+ MH_SHA1_REF(buff + offset, size, hash_ref);
+
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:");
+ printf(" %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c
new file mode 100644
index 000000000..4af220299
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c
@@ -0,0 +1,110 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * mh_sha1_update_base.c contains the template for mh_sha1_update_XXX.
+ * The default definitions below select the base type, which generates
+ * mh_sha1_update_base. Other variants are generated by predefining these
+ * macros before this file is included (see mh_sha1.c).
+ */
+#ifndef MH_SHA1_UPDATE_FUNCTION
+#include "mh_sha1_internal.h"
+#include <string.h>
+
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_base
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_base
+#define MH_SHA1_UPDATE_SLVER
+#endif
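+
+/*
+ * Illustrative sketch only (the variant names are examples, not a definitive
+ * list): an architecture-specific flavour is generated by predefining the
+ * macros before including this file, e.g.
+ *
+ *   #define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_avx2
+ *   #define MH_SHA1_BLOCK_FUNCTION  mh_sha1_block_avx2
+ *   #include "mh_sha1_update_base.c"
+ *
+ * The aarch64 murmur3 sources in this library use the same include-template
+ * pattern (see mh_sha1_murmur3_asimd.c).
+ */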
+
+int MH_SHA1_UPDATE_FUNCTION(struct mh_sha1_ctx *ctx, const void *buffer, uint32_t len)
+{
+
+ uint8_t *partial_block_buffer;
+ uint64_t partial_block_len;
+ uint64_t num_blocks;
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+ if (ctx == NULL)
+ return MH_SHA1_CTX_ERROR_NULL;
+
+ if (len == 0)
+ return MH_SHA1_CTX_ERROR_NONE;
+
+ partial_block_len = ctx->total_length % MH_SHA1_BLOCK_SIZE;
+ partial_block_buffer = ctx->partial_block_buffer;
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+
+ ctx->total_length += len;
+ // Not enough input data for mh_sha1 calculation
+ if (len + partial_block_len < MH_SHA1_BLOCK_SIZE) {
+ memcpy(partial_block_buffer + partial_block_len, input_data, len);
+ return MH_SHA1_CTX_ERROR_NONE;
+ }
+ // mh_sha1 calculation for the previous partial block
+ if (partial_block_len != 0) {
+ memcpy(partial_block_buffer + partial_block_len, input_data,
+ MH_SHA1_BLOCK_SIZE - partial_block_len);
+ //do one_block process
+ MH_SHA1_BLOCK_FUNCTION(partial_block_buffer, mh_sha1_segs_digests,
+ aligned_frame_buffer, 1);
+ input_data += MH_SHA1_BLOCK_SIZE - partial_block_len;
+ len -= MH_SHA1_BLOCK_SIZE - partial_block_len;
+ memset(partial_block_buffer, 0, MH_SHA1_BLOCK_SIZE);
+ }
+ // Calculate mh_sha1 for the current blocks
+ num_blocks = len / MH_SHA1_BLOCK_SIZE;
+ if (num_blocks > 0) {
+ //do num_blocks process
+ MH_SHA1_BLOCK_FUNCTION(input_data, mh_sha1_segs_digests, aligned_frame_buffer,
+ num_blocks);
+ len -= num_blocks * MH_SHA1_BLOCK_SIZE;
+ input_data += num_blocks * MH_SHA1_BLOCK_SIZE;
+ }
+ // Store the partial block
+ if (len != 0) {
+ memcpy(partial_block_buffer, input_data, len);
+ }
+
+ return MH_SHA1_CTX_ERROR_NONE;
+
+}
+
+#ifdef MH_SHA1_UPDATE_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+ // Version info
+struct slver mh_sha1_update_base_slver_0000027a;
+struct slver mh_sha1_update_base_slver = { 0x027a, 0x00, 0x00 };
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c
new file mode 100644
index 000000000..942dfd09f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c
@@ -0,0 +1,240 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_CTX_ERROR_NONE){ \
+ printf("The mh_sha1 function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest);
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_ref[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_ref[i])
+ mh_sha1_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("ref: ");
+ dump((char *)hash_ref, 20);
+ printf("test: ");
+ dump((char *)hash_test, 20);
+ }
+
+ return mh_sha1_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0, i;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_ref[SHA1_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int update_count;
+ int size1, size2, offset, addr_offset;
+ struct mh_sha1_ctx *update_ctx = NULL;
+ uint8_t *mem_addr = NULL;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_ref(buff, TEST_LEN, hash_ref);
+
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+ // Test various size messages by update twice.
+ printf("\n various size messages by update twice tests");
+ for (size1 = TEST_LEN; size1 >= 0; size1--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_ref(buff, TEST_LEN, hash_ref);
+
+ // subsequent update
+ size2 = TEST_LEN - size1; // size2 is the remainder of the buffer
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size1));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + size1, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various update count
+ printf("\n various update count tests");
+ for (update_count = 1; update_count <= TEST_LEN; update_count++) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_ref(buff, TEST_LEN, hash_ref);
+
+ // subsequent update
+ size1 = TEST_LEN / update_count;
+ size2 = TEST_LEN - size1 * (update_count - 1); // size2 is the remaining tail, generally different from size1
+
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ for (i = 1, offset = 0; i < update_count; i++) {
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size1));
+ offset += size1;
+ }
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // test various start address of ctx.
+ printf("\n various start address of ctx test");
+ free(update_ctx);
+ mem_addr = (uint8_t *) malloc(sizeof(*update_ctx) + AVX512_ALIGNED * 10);
+ for (addr_offset = AVX512_ALIGNED * 10; addr_offset >= 0; addr_offset--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_ref(buff, TEST_LEN, hash_ref);
+
+ // an unaligned offset
+ update_ctx = (struct mh_sha1_ctx *)(mem_addr + addr_offset);
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail addr_offset=%d\n", addr_offset);
+ return -1;
+ }
+
+ if ((addr_offset & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c
new file mode 100644
index 000000000..224977e6c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c
@@ -0,0 +1,204 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha1_internal.h"
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA1 Functions for mh_sha1
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define W(x) w[(x) & 15]
+
+#define step00_19(i,a,b,c,d,e) \
+ if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ else W(i) = to_be32(ww[i]); \
+ e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \
+ b = rol32(b,30)
+
+#define step20_39(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \
+ b = rol32(b,30)
+
+#define step40_59(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \
+ b = rol32(b,30)
+
+#define step60_79(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \
+ b = rol32(b,30)
+
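+/*
+ * Note on W(x): the 80-entry SHA-1 message schedule only references
+ * W[t-3], W[t-8], W[t-14] and W[t-16], so the step macros above keep a
+ * 16-word rolling window w[] and index it modulo 16 via W(x) = w[(x) & 15].
+ */
+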
+static void OPT_FIX sha1_single_for_mh_sha1(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e;
+ uint32_t w[16] = { 0 };
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+
+ step00_19(0, a, b, c, d, e);
+ step00_19(1, e, a, b, c, d);
+ step00_19(2, d, e, a, b, c);
+ step00_19(3, c, d, e, a, b);
+ step00_19(4, b, c, d, e, a);
+ step00_19(5, a, b, c, d, e);
+ step00_19(6, e, a, b, c, d);
+ step00_19(7, d, e, a, b, c);
+ step00_19(8, c, d, e, a, b);
+ step00_19(9, b, c, d, e, a);
+ step00_19(10, a, b, c, d, e);
+ step00_19(11, e, a, b, c, d);
+ step00_19(12, d, e, a, b, c);
+ step00_19(13, c, d, e, a, b);
+ step00_19(14, b, c, d, e, a);
+ step00_19(15, a, b, c, d, e);
+ step00_19(16, e, a, b, c, d);
+ step00_19(17, d, e, a, b, c);
+ step00_19(18, c, d, e, a, b);
+ step00_19(19, b, c, d, e, a);
+
+ step20_39(20, a, b, c, d, e);
+ step20_39(21, e, a, b, c, d);
+ step20_39(22, d, e, a, b, c);
+ step20_39(23, c, d, e, a, b);
+ step20_39(24, b, c, d, e, a);
+ step20_39(25, a, b, c, d, e);
+ step20_39(26, e, a, b, c, d);
+ step20_39(27, d, e, a, b, c);
+ step20_39(28, c, d, e, a, b);
+ step20_39(29, b, c, d, e, a);
+ step20_39(30, a, b, c, d, e);
+ step20_39(31, e, a, b, c, d);
+ step20_39(32, d, e, a, b, c);
+ step20_39(33, c, d, e, a, b);
+ step20_39(34, b, c, d, e, a);
+ step20_39(35, a, b, c, d, e);
+ step20_39(36, e, a, b, c, d);
+ step20_39(37, d, e, a, b, c);
+ step20_39(38, c, d, e, a, b);
+ step20_39(39, b, c, d, e, a);
+
+ step40_59(40, a, b, c, d, e);
+ step40_59(41, e, a, b, c, d);
+ step40_59(42, d, e, a, b, c);
+ step40_59(43, c, d, e, a, b);
+ step40_59(44, b, c, d, e, a);
+ step40_59(45, a, b, c, d, e);
+ step40_59(46, e, a, b, c, d);
+ step40_59(47, d, e, a, b, c);
+ step40_59(48, c, d, e, a, b);
+ step40_59(49, b, c, d, e, a);
+ step40_59(50, a, b, c, d, e);
+ step40_59(51, e, a, b, c, d);
+ step40_59(52, d, e, a, b, c);
+ step40_59(53, c, d, e, a, b);
+ step40_59(54, b, c, d, e, a);
+ step40_59(55, a, b, c, d, e);
+ step40_59(56, e, a, b, c, d);
+ step40_59(57, d, e, a, b, c);
+ step40_59(58, c, d, e, a, b);
+ step40_59(59, b, c, d, e, a);
+
+ step60_79(60, a, b, c, d, e);
+ step60_79(61, e, a, b, c, d);
+ step60_79(62, d, e, a, b, c);
+ step60_79(63, c, d, e, a, b);
+ step60_79(64, b, c, d, e, a);
+ step60_79(65, a, b, c, d, e);
+ step60_79(66, e, a, b, c, d);
+ step60_79(67, d, e, a, b, c);
+ step60_79(68, c, d, e, a, b);
+ step60_79(69, b, c, d, e, a);
+ step60_79(70, a, b, c, d, e);
+ step60_79(71, e, a, b, c, d);
+ step60_79(72, d, e, a, b, c);
+ step60_79(73, c, d, e, a, b);
+ step60_79(74, b, c, d, e, a);
+ step60_79(75, a, b, c, d, e);
+ step60_79(76, e, a, b, c, d);
+ step60_79(77, d, e, a, b, c);
+ step60_79(78, c, d, e, a, b);
+ step60_79(79, b, c, d, e, a);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+}
+
+void sha1_for_mh_sha1(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA1_BLOCK_SIZE];
+
+ digest[0] = MH_SHA1_H0;
+ digest[1] = MH_SHA1_H1;
+ digest[2] = MH_SHA1_H2;
+ digest[3] = MH_SHA1_H3;
+ digest[4] = MH_SHA1_H4;
+
+ i = len;
+ while (i >= SHA1_BLOCK_SIZE) {
+ sha1_single_for_mh_sha1(input_data, digest);
+ input_data += SHA1_BLOCK_SIZE;
+ i -= SHA1_BLOCK_SIZE;
+ }
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - 8); j++)
+ buf[j] = 0;
+
+ if (i > SHA1_BLOCK_SIZE - 8)
+ i = 2 * SHA1_BLOCK_SIZE;
+ else
+ i = SHA1_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha1_single_for_mh_sha1(buf, digest);
+ if (i == (2 * SHA1_BLOCK_SIZE))
+ sha1_single_for_mh_sha1(buf + SHA1_BLOCK_SIZE, digest);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am
new file mode 100644
index 000000000..e6ea6784c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am
@@ -0,0 +1,89 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_murmur = mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c
+
+lsrc_stitch = mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm
+
+lsrc_stitch += mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm
+
+lsrc_x86_64 += $(lsrc_murmur) \
+ $(lsrc_stitch)
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += $(lsrc_murmur) \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c \
+ mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_dispatcher.c \
+ mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_ce.c \
+ mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_ce.S \
+ mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_asimd.c \
+ mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_asimd.S \
+ mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_multibinary.S
+
+lsrc_base_aliases += $(lsrc_murmur) \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_base_aliases.c
+
+other_src += include/reg_sizes.asm \
+ include/multibinary.asm \
+ include/test.h \
+ mh_sha1/mh_sha1_internal.h \
+ mh_sha1_murmur3_x64_128/murmur3_x64_128.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h
+
+src_include += -I $(srcdir)/mh_sha1_murmur3_x64_128
+
+extern_hdrs += include/mh_sha1_murmur3_x64_128.h
+
+unit_tests += mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test
+
+perf_tests += mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf
+
+
+mh_sha1_murmur3_x64_128_test: mh_sha1_ref.o murmur3_x64_128.o
+mh_sha1_murmur3_x64_128_mh_sha1_murmur3_x64_128_test_LDADD = mh_sha1/mh_sha1_ref.lo mh_sha1_murmur3_x64_128/murmur3_x64_128.lo libisal_crypto.la
+
+mh_sha1_murmur3_x64_128_update_test: mh_sha1_ref.o murmur3_x64_128.o
+mh_sha1_murmur3_x64_128_mh_sha1_murmur3_x64_128_update_test_LDADD = mh_sha1/mh_sha1_ref.lo mh_sha1_murmur3_x64_128/murmur3_x64_128.lo libisal_crypto.la
+
+mh_sha1_murmur3_x64_128_perf: mh_sha1_ref.o murmur3_x64_128.o
+mh_sha1_murmur3_x64_128_mh_sha1_murmur3_x64_128_perf_LDADD = mh_sha1/mh_sha1_ref.lo mh_sha1_murmur3_x64_128/murmur3_x64_128.lo libisal_crypto.la
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_dispatcher.c
new file mode 100644
index 000000000..e6993703a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_dispatcher.c
@@ -0,0 +1,53 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(mh_sha1_murmur3_x64_128_update)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(mh_sha1_murmur3_update_ce);
+
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(mh_sha1_murmur3_update_asimd);
+
+ return PROVIDER_BASIC(mh_sha1_murmur3_x64_128_update);
+}
+
+DEFINE_INTERFACE_DISPATCHER(mh_sha1_murmur3_x64_128_finalize)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(mh_sha1_murmur3_finalize_ce);
+
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(mh_sha1_murmur3_finalize_asimd);
+
+ return PROVIDER_BASIC(mh_sha1_murmur3_x64_128_finalize);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_internal.h
new file mode 100644
index 000000000..22b33cbd2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_aarch64_internal.h
@@ -0,0 +1,91 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA1_MURMUR3_AARCH64_INTERNAL_H_
+#define _MH_SHA1_MURMUR3_AARCH64_INTERNAL_H_
+
+/**
+ * @file mh_sha1_murmur3_aarch64_internal.h
+ * @brief mh_sha1_murmur3_aarch64 internal function prototypes and macros
+ *
+ * Interface for mh_sha1_murmur3_aarch64 internal functions
+ *
+ */
+#include <stdint.h>
+#include "mh_sha1_murmur3_x64_128_internal.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ /**
+ * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires Crypto Extension
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+void mh_sha1_murmur3_block_ce(const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t
+ murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+ /**
+ * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires ASIMD
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+void mh_sha1_murmur3_block_asimd(const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t
+ murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_asimd.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_asimd.c
new file mode 100644
index 000000000..9cac8504e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_asimd.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_murmur3_aarch64_internal.h"
+
+extern void mh_sha1_tail_asimd(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t * frame_buffer,
+ uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+extern void mh_sha1_block_asimd(const uint8_t * input_data,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+// mh_sha1_murmur3_update_asimd.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_update_asimd
+#define BLOCK_FUNCTION mh_sha1_murmur3_block_asimd
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+// mh_sha1_murmur3_finalize_asimd.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_finalize_asimd
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_asimd
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_asimd.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_asimd.S
new file mode 100644
index 000000000..575129f36
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_asimd.S
@@ -0,0 +1,224 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+#include "sha1_asimd_common.S"
+.macro sha1_step_16_79_interleave0 windex:req
+ // interleaving murmur3 operation
+ .if (\windex % 4) == 0
+ ldp mur_data1, mur_data2, [mur_data], #16
+ .endif
+ .if (\windex % 4) == 1
+ /* rotate left by 31 bits */
+ ror mur_data1, mur_data1, #64-31
+ /* rotate left by 33 bits */
+ ror mur_data2, mur_data2, #64-33
+ .endif
+ .if (\windex % 4) == 2
+ eor mur_hash1, mur_hash1, mur_data1
+ /* rotate left by 27 bits */
+ ror mur_hash1, mur_hash1, #64-27
+ .endif
+ .if (\windex % 4) == 3
+ eor mur_hash2, mur_hash2, mur_data2
+ /* rotate left by 31 bits */
+ ror mur_hash2, mur_hash2, #64-31
+ .endif
+.endm
+
+.macro sha1_step_16_79_interleave1 windex:req
+ // interleaving murmur3 operation
+ .if (\windex % 4) == 0
+ mul mur_data1, mur_data1, mur_c1
+ mul mur_data2, mur_data2, mur_c2
+ .endif
+ .if (\windex % 4) == 1
+ mul mur_data1, mur_data1, mur_c2
+ mul mur_data2, mur_data2, mur_c1
+ .endif
+ .if (\windex % 4) == 2
+ add mur_hash1, mur_hash1, mur_hash2
+ //mur_hash1 = mur_hash1 * 5 + N1
+ add mur_hash1, mur_hash1, mur_hash1, LSL #2
+ add mur_hash1, mur_n1, mur_hash1
+ .endif
+ .if (\windex % 4) == 3
+ add mur_hash2, mur_hash2, mur_hash1
+ // mur_hash2 = mur_hash2 * 5 + N2
+ add mur_hash2, mur_hash2, mur_hash2, LSL #2
+ add mur_hash2, mur_n2, mur_hash2
+ .endif
+.endm
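+
+/*
+ * Taken together, one pass of the two interleave macros over
+ * windex % 4 == 0..3 performs a single murmur3_x64_128 body round on
+ * 16 bytes of input: k1 *= C1; k1 = rotl(k1, 31); k1 *= C2; h1 ^= k1;
+ * h1 = rotl(h1, 27) + h2; h1 = h1 * 5 + N1; and the symmetric k2/h2 half
+ * with C2/C1, rotations 33/31 and N2, spread across four SHA-1 rounds.
+ */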
+
+.macro load_x4_word idx:req
+ ld1 {WORD\idx\().16b},[segs_ptr]
+ add segs_ptr,segs_ptr,#64
+.endm
+
+/*
+ * void mh_sha1_murmur3_block_asimd (const uint8_t * input_data,
+ * uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ * uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ * uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ * uint32_t num_blocks);
+ * arg 0 pointer to input data
+ * arg 1 pointer to digests, including segment digests (uint32_t digests[16][5])
+ * arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
+ * arg 3 pointer to murmur3 digest
+ * arg 4 number of 1KB blocks
+ */
+
+ input_data .req x0
+ sha1_digest .req x1
+ data_buf .req x2
+ mur_digest .req x3
+ num_blocks .req w4
+
+ src .req x5
+ dst .req x6
+ offs .req x7
+ mh_segs .req x8
+ tmp .req x9
+ tmpw .req w9
+ segs_ptr .req x10
+ mur_hash1 .req x11
+ mur_hash2 .req x12
+ mur_c1 .req x13
+ mur_c2 .req x14
+ mur_data1 .req x19
+ mur_data2 .req x20
+ mur_data .req x21
+ mur_n1 .req x22
+ mur_n1_w .req w22
+ mur_n2 .req x23
+ mur_n2_w .req w23
+ block_ctr .req w24
+
+ .global mh_sha1_murmur3_block_asimd
+ .type mh_sha1_murmur3_block_asimd, %function
+mh_sha1_murmur3_block_asimd:
+ cmp num_blocks, #0
+ beq .return
+ sha1_asimd_save_stack
+ stp x19, x20, [sp, -48]!
+ stp x21, x22, [sp, 16]
+ stp x23, x24, [sp, 32]
+
+ mov mur_data, input_data
+ ldr mur_hash1, [mur_digest]
+ ldr mur_hash2, [mur_digest, 8]
+ adr mur_c1, C1
+ ldr mur_c1, [mur_c1]
+ adr mur_c2, C2
+ ldr mur_c2, [mur_c2]
+ adr tmp, N1
+ ldr mur_n1_w, [tmp]
+ adr tmp, N2
+ ldr mur_n2_w, [tmp]
+
+ mov mh_segs, #0
+.seg_loops:
+ add segs_ptr,input_data,mh_segs
+ mov offs, #64
+ add src, sha1_digest, mh_segs
+ ld1 {VA.4S}, [src], offs
+ ld1 {VB.4S}, [src], offs
+ ld1 {VC.4S}, [src], offs
+ ld1 {VD.4S}, [src], offs
+ ld1 {VE.4S}, [src], offs
+ mov block_ctr,num_blocks
+
+.block_loop:
+ sha1_single
+ subs block_ctr, block_ctr, 1
+ bne .block_loop
+
+ mov offs, #64
+ add dst, sha1_digest, mh_segs
+ st1 {VA.4S}, [dst], offs
+ st1 {VB.4S}, [dst], offs
+ st1 {VC.4S}, [dst], offs
+ st1 {VD.4S}, [dst], offs
+ st1 {VE.4S}, [dst], offs
+
+ add mh_segs, mh_segs, #16
+ cmp mh_segs, #64
+ bne .seg_loops
+
+ /* save murmur-hash digest */
+ str mur_hash1, [mur_digest], #8
+ str mur_hash2, [mur_digest]
+
+ ldp x21, x22, [sp, 16]
+ ldp x23, x24, [sp, 32]
+ ldp x19, x20, [sp], 48
+ sha1_asimd_restore_stack
+.return:
+ ret
+
+ .size mh_sha1_murmur3_block_asimd, .-mh_sha1_murmur3_block_asimd
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+KEY_0:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+KEY_1:
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+KEY_2:
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+KEY_3:
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+N1:
+ .word 0x52dce729
+ .word 0x52dce729
+ .word 0x52dce729
+ .word 0x52dce729
+N2:
+ .word 0x38495ab5
+ .word 0x38495ab5
+ .word 0x38495ab5
+ .word 0x38495ab5
+C1:
+ .dword 0x87c37b91114253d5
+ .dword 0x87c37b91114253d5
+C2:
+ .dword 0x4cf5ad432745937f
+ .dword 0x4cf5ad432745937f
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_ce.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_ce.S
new file mode 100644
index 000000000..7f4256e20
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_block_ce.S
@@ -0,0 +1,482 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg lane0_msg_0, 0
+ declare_var_vector_reg lane1_msg_0, 1
+ declare_var_vector_reg lane2_msg_0, 2
+ declare_var_vector_reg lane3_msg_0, 3
+ declare_var_vector_reg lane0_msg_1, 4
+ declare_var_vector_reg lane1_msg_1, 5
+ declare_var_vector_reg lane2_msg_1, 6
+ declare_var_vector_reg lane3_msg_1, 7
+ declare_var_vector_reg lane0_msg_2, 8
+ declare_var_vector_reg lane1_msg_2, 9
+ declare_var_vector_reg lane2_msg_2,10
+ declare_var_vector_reg lane3_msg_2,11
+ declare_var_vector_reg lane0_msg_3,12
+ declare_var_vector_reg lane1_msg_3,13
+ declare_var_vector_reg lane2_msg_3,14
+ declare_var_vector_reg lane3_msg_3,15
+
+ declare_var_vector_reg lane0_abcd ,16
+ declare_var_vector_reg lane1_abcd ,17
+ declare_var_vector_reg lane2_abcd ,18
+ declare_var_vector_reg lane3_abcd ,19
+ declare_var_vector_reg lane0_tmp0 ,20
+ declare_var_vector_reg lane1_tmp0 ,21
+ declare_var_vector_reg lane2_tmp0 ,22
+ declare_var_vector_reg lane3_tmp0 ,23
+ declare_var_vector_reg lane0_tmp1 ,24
+ declare_var_vector_reg lane1_tmp1 ,25
+ declare_var_vector_reg lane2_tmp1 ,26
+ declare_var_vector_reg lane3_tmp1 ,27
+
+
+ declare_var_vector_reg e0 ,28
+ declare_var_vector_reg e1 ,29
+ declare_var_vector_reg key ,30
+ declare_var_vector_reg tmp ,31
+
+ key_adr .req x5
+ msg_adr .req x6
+ block_cnt .req x7
+ offs .req x8
+ mur_n1 .req x9
+ mur_n1_w .req w9
+ mur_n2 .req x10
+ mur_n2_w .req w10
+ mur_hash1 .req x11
+ mur_hash2 .req x12
+ mur_c1 .req x13
+ mur_c2 .req x14
+ mur_data1 .req x15
+
+ digest_adr .req x16
+ tmp0_adr .req x17
+ tmp1_adr .req x18
+ mur_data2 .req x19
+ mur_data .req x20
+
+.macro murmur3_00
+ ldp mur_data1, mur_data2, [mur_data], #16
+ mul mur_data1, mur_data1, mur_c1
+ mul mur_data2, mur_data2, mur_c2
+.endm
+
+.macro murmur3_01
+ /* rotate left by 31 bits */
+ ror mur_data1, mur_data1, #64-31
+ /* rotate left by 33 bits */
+ ror mur_data2, mur_data2, #64-33
+ mul mur_data1, mur_data1, mur_c2
+ mul mur_data2, mur_data2, mur_c1
+.endm
+
+.macro murmur3_02
+ eor mur_hash1, mur_hash1, mur_data1
+ /* rotate left by 27 bits */
+ ror mur_hash1, mur_hash1, #64-27
+ add mur_hash1, mur_hash1, mur_hash2
+ // mur_hash1 = mur_hash1 * 5 + N1
+ add mur_hash1, mur_hash1, mur_hash1, LSL #2
+ add mur_hash1, mur_n1, mur_hash1
+.endm
+
+.macro murmur3_03
+ eor mur_hash2, mur_hash2, mur_data2
+ /* rotate left by 31 bits */
+ ror mur_hash2, mur_hash2, #64-31
+ add mur_hash2, mur_hash2, mur_hash1
+ // mur_hash2 = mur_hash2 * 5 + N2
+ add mur_hash2, mur_hash2, mur_hash2, LSL #2
+ add mur_hash2, mur_n2, mur_hash2
+.endm
+
+/**
+ * macros for rounds 4-67
+ * the code executes 16 times per block, allowing the interleaved murmur3 operations to process 256 bytes
+*/
+.macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req
+ sha1h lane0_\tmp0\()_s, lane0_\abcd\()_s
+ sha1h lane1_\tmp0\()_s, lane1_\abcd\()_s
+ sha1h lane2_\tmp0\()_s, lane2_\abcd\()_s
+ sha1h lane3_\tmp0\()_s, lane3_\abcd\()_s
+ mov \e0\()_v.S[0],lane0_\tmp0\()_v.S[0]
+ mov \e0\()_v.S[1],lane1_\tmp0\()_v.S[0]
+ mov \e0\()_v.S[2],lane2_\tmp0\()_v.S[0]
+ mov \e0\()_v.S[3],lane3_\tmp0\()_v.S[0]
+ mov lane0_\tmp0\()_v.S[0],\e1\()_v.S[0]
+ mov lane1_\tmp0\()_v.S[0],\e1\()_v.S[1]
+ mov lane2_\tmp0\()_v.S[0],\e1\()_v.S[2]
+ mov lane3_\tmp0\()_v.S[0],\e1\()_v.S[3]
+ \inst lane0_\abcd\()_q,lane0_\tmp0\()_s,lane0_\tmp1\()_v.4s
+ murmur3_00
+ \inst lane1_\abcd\()_q,lane1_\tmp0\()_s,lane1_\tmp1\()_v.4s
+ murmur3_01
+ \inst lane2_\abcd\()_q,lane2_\tmp0\()_s,lane2_\tmp1\()_v.4s
+ murmur3_02
+ \inst lane3_\abcd\()_q,lane3_\tmp0\()_s,lane3_\tmp1\()_v.4s
+ murmur3_03
+ ld1 {lane0_\tmp0\()_v.4s-lane3_\tmp0\()_v.4s},[\tmp0\()_adr]
+ add lane0_\tmp1\()_v.4s,lane0_\msg3\()_v.4s,key_v.4s
+ add lane1_\tmp1\()_v.4s,lane1_\msg3\()_v.4s,key_v.4s
+ add lane2_\tmp1\()_v.4s,lane2_\msg3\()_v.4s,key_v.4s
+ add lane3_\tmp1\()_v.4s,lane3_\msg3\()_v.4s,key_v.4s
+ st1 {lane0_\tmp1\()_v.4s-lane3_\tmp1\()_v.4s},[\tmp1\()_adr]
+ sha1su1 lane0_\msg0\()_v.4s,lane0_\msg3\()_v.4s
+ sha1su1 lane1_\msg0\()_v.4s,lane1_\msg3\()_v.4s
+ sha1su1 lane2_\msg0\()_v.4s,lane2_\msg3\()_v.4s
+ sha1su1 lane3_\msg0\()_v.4s,lane3_\msg3\()_v.4s
+ sha1su0 lane0_\msg1\()_v.4s,lane0_\msg2\()_v.4s,lane0_\msg3\()_v.4s
+ sha1su0 lane1_\msg1\()_v.4s,lane1_\msg2\()_v.4s,lane1_\msg3\()_v.4s
+ sha1su0 lane2_\msg1\()_v.4s,lane2_\msg2\()_v.4s,lane2_\msg3\()_v.4s
+ sha1su0 lane3_\msg1\()_v.4s,lane3_\msg2\()_v.4s,lane3_\msg3\()_v.4s
+.endm
+
+
+/*
+ * void mh_sha1_murmur3_block_ce (const uint8_t * input_data,
+ * uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ * uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ * uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ * uint32_t num_blocks);
+ * arg 0 pointer to input data
+ * arg 1 pointer to digests, including the segment digests (uint32_t digests[16][5])
+ * arg 2 pointer to aligned_frame_buffer, which is used to save the big-endian data.
+ * arg 3 pointer to murmur3 digest
+ * arg 4 number of 1KB blocks
+ */
+
+/*
+Arguments list
+*/
+ input_data .req x0
+ digests .req x1
+ frame_buffer .req x2
+ mur_digest .req x3
+ num_blocks .req w4
+
+ .global mh_sha1_murmur3_block_ce
+ .type mh_sha1_murmur3_block_ce, %function
+mh_sha1_murmur3_block_ce:
+	// save callee-saved vector and general registers
+ stp d8, d9, [sp, -80]!
+
+ stp d10, d11, [sp, 16]
+ stp d12, d13, [sp, 32]
+ stp d14, d15, [sp, 48]
+ stp x19, x20, [sp, 64]
+
+ mov mur_data, input_data
+ ldr mur_hash1, [mur_digest]
+ ldr mur_hash2, [mur_digest, 8]
+ adr mur_c1, C1
+ ldr mur_c1, [mur_c1]
+ adr mur_c2, C2
+ ldr mur_c2, [mur_c2]
+ adr tmp0_adr, N1
+ ldr mur_n1_w, [tmp0_adr]
+ adr tmp0_adr, N2
+ ldr mur_n2_w, [tmp0_adr]
+
+ mov tmp0_adr,frame_buffer
+ add tmp1_adr,tmp0_adr,128
+
+
+start_loop:
+ mov block_cnt,0
+ mov msg_adr,input_data
+lane_loop:
+ mov offs,64
+ adr key_adr,KEY_0
+ // load msg 0
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[3],[msg_adr],offs
+
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[3],[msg_adr],offs
+
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_2_v.S-lane3_msg_2_v.S}[3],[msg_adr],offs
+
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[0],[msg_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[1],[msg_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[2],[msg_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[3],[msg_adr],offs
+
+ add digest_adr,digests,block_cnt
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[0],[digest_adr],offs
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[1],[digest_adr],offs
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[2],[digest_adr],offs
+ ld4 {lane0_abcd_v.S-lane3_abcd_v.S}[3],[digest_adr],offs
+ ldr e0_q,[digest_adr]
+
+ // load key_0
+ ldr key_q,[key_adr]
+
+ rev32 lane0_msg_0_v.16b,lane0_msg_0_v.16b
+ rev32 lane1_msg_0_v.16b,lane1_msg_0_v.16b
+ rev32 lane2_msg_0_v.16b,lane2_msg_0_v.16b
+ rev32 lane3_msg_0_v.16b,lane3_msg_0_v.16b
+ rev32 lane0_msg_1_v.16b,lane0_msg_1_v.16b
+ rev32 lane1_msg_1_v.16b,lane1_msg_1_v.16b
+ rev32 lane2_msg_1_v.16b,lane2_msg_1_v.16b
+ rev32 lane3_msg_1_v.16b,lane3_msg_1_v.16b
+ rev32 lane0_msg_2_v.16b,lane0_msg_2_v.16b
+ rev32 lane1_msg_2_v.16b,lane1_msg_2_v.16b
+ rev32 lane2_msg_2_v.16b,lane2_msg_2_v.16b
+ rev32 lane3_msg_2_v.16b,lane3_msg_2_v.16b
+ rev32 lane0_msg_3_v.16b,lane0_msg_3_v.16b
+ rev32 lane1_msg_3_v.16b,lane1_msg_3_v.16b
+ rev32 lane2_msg_3_v.16b,lane2_msg_3_v.16b
+ rev32 lane3_msg_3_v.16b,lane3_msg_3_v.16b
+
+ add lane0_tmp1_v.4s,lane0_msg_1_v.4s,key_v.4s
+ add lane1_tmp1_v.4s,lane1_msg_1_v.4s,key_v.4s
+ add lane2_tmp1_v.4s,lane2_msg_1_v.4s,key_v.4s
+ add lane3_tmp1_v.4s,lane3_msg_1_v.4s,key_v.4s
+ st1 {lane0_tmp1_v.4s-lane3_tmp1_v.4s},[tmp1_adr]
+
+ add lane0_tmp0_v.4s,lane0_msg_0_v.4s,key_v.4s
+ add lane1_tmp0_v.4s,lane1_msg_0_v.4s,key_v.4s
+ add lane2_tmp0_v.4s,lane2_msg_0_v.4s,key_v.4s
+ add lane3_tmp0_v.4s,lane3_msg_0_v.4s,key_v.4s
+
+ /* rounds 0-3 */
+ sha1h lane0_tmp1_s,lane0_abcd_s
+ sha1h lane1_tmp1_s,lane1_abcd_s
+ sha1h lane2_tmp1_s,lane2_abcd_s
+ sha1h lane3_tmp1_s,lane3_abcd_s
+ mov e1_v.S[0],lane0_tmp1_v.S[0]
+ mov e1_v.S[1],lane1_tmp1_v.S[0]
+ mov e1_v.S[2],lane2_tmp1_v.S[0]
+ mov e1_v.S[3],lane3_tmp1_v.S[0]
+ mov lane0_tmp1_v.S[0],e0_v.S[0]
+ mov lane1_tmp1_v.S[0],e0_v.S[1]
+ mov lane2_tmp1_v.S[0],e0_v.S[2]
+ mov lane3_tmp1_v.S[0],e0_v.S[3]
+ sha1c lane0_abcd_q,lane0_tmp1_s,lane0_tmp0_v.4s
+ sha1c lane1_abcd_q,lane1_tmp1_s,lane1_tmp0_v.4s
+ sha1c lane2_abcd_q,lane2_tmp1_s,lane2_tmp0_v.4s
+ sha1c lane3_abcd_q,lane3_tmp1_s,lane3_tmp0_v.4s
+ ld1 {lane0_tmp1_v.4s-lane3_tmp1_v.4s},[tmp1_adr]
+ add lane0_tmp0_v.4s,lane0_msg_2_v.4s,key_v.4s
+ sha1su0 lane0_msg_0_v.4s,lane0_msg_1_v.4s,lane0_msg_2_v.4s
+ add lane1_tmp0_v.4s,lane1_msg_2_v.4s,key_v.4s
+ sha1su0 lane1_msg_0_v.4s,lane1_msg_1_v.4s,lane1_msg_2_v.4s
+ add lane2_tmp0_v.4s,lane2_msg_2_v.4s,key_v.4s
+ sha1su0 lane2_msg_0_v.4s,lane2_msg_1_v.4s,lane2_msg_2_v.4s
+ add lane3_tmp0_v.4s,lane3_msg_2_v.4s,key_v.4s
+ sha1su0 lane3_msg_0_v.4s,lane3_msg_1_v.4s,lane3_msg_2_v.4s
+ st1 {lane0_tmp0_v.4s-lane3_tmp0_v.4s},[tmp0_adr]
+
+ sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 4-7 */
+ sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+
+
+ adr key_adr,KEY_1
+ ldr key_q,[key_adr]
+ sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1 /* rounds 12-15 */
+ sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 20-23 */
+ sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1
+
+ adr key_adr,KEY_2
+ ldr key_q,[key_adr]
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 36-39 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1
+ sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+
+ adr key_adr,KEY_3
+ ldr key_q,[key_adr]
+ sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp0,e1,tmp1 /* rounds 52-55 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp1,e0,tmp0
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp0,e1,tmp1
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp1,e0,tmp0
+
+ // msg2 and msg1 are free
+ mov lane0_msg_2_v.S[0],e1_v.S[0]
+ mov lane1_msg_2_v.S[0],e1_v.S[1]
+ mov lane2_msg_2_v.S[0],e1_v.S[2]
+ mov lane3_msg_2_v.S[0],e1_v.S[3]
+
+ /* rounds 68-71 */
+ sha1h lane0_msg_1_s,lane0_abcd_s
+ sha1h lane1_msg_1_s,lane1_abcd_s
+ sha1h lane2_msg_1_s,lane2_abcd_s
+ sha1h lane3_msg_1_s,lane3_abcd_s
+ sha1p lane0_abcd_q,lane0_msg_2_s,lane0_tmp1_v.4s
+ sha1p lane1_abcd_q,lane1_msg_2_s,lane1_tmp1_v.4s
+ sha1p lane2_abcd_q,lane2_msg_2_s,lane2_tmp1_v.4s
+ sha1p lane3_abcd_q,lane3_msg_2_s,lane3_tmp1_v.4s
+ add lane0_tmp1_v.4s,lane0_msg_3_v.4s,key_v.4s
+ add lane1_tmp1_v.4s,lane1_msg_3_v.4s,key_v.4s
+ add lane2_tmp1_v.4s,lane2_msg_3_v.4s,key_v.4s
+ add lane3_tmp1_v.4s,lane3_msg_3_v.4s,key_v.4s
+ sha1su1 lane0_msg_0_v.4s,lane0_msg_3_v.4s
+ sha1su1 lane1_msg_0_v.4s,lane1_msg_3_v.4s
+ sha1su1 lane2_msg_0_v.4s,lane2_msg_3_v.4s
+ sha1su1 lane3_msg_0_v.4s,lane3_msg_3_v.4s
+
+ /* rounds 72-75 */
+ sha1h lane0_msg_2_s,lane0_abcd_s
+ sha1h lane1_msg_2_s,lane1_abcd_s
+ sha1h lane2_msg_2_s,lane2_abcd_s
+ sha1h lane3_msg_2_s,lane3_abcd_s
+ sha1p lane0_abcd_q,lane0_msg_1_s,lane0_tmp0_v.4s
+ sha1p lane1_abcd_q,lane1_msg_1_s,lane1_tmp0_v.4s
+ sha1p lane2_abcd_q,lane2_msg_1_s,lane2_tmp0_v.4s
+ sha1p lane3_abcd_q,lane3_msg_1_s,lane3_tmp0_v.4s
+
+ /* rounds 76-79 */
+ sha1h lane0_msg_1_s,lane0_abcd_s
+ sha1h lane1_msg_1_s,lane1_abcd_s
+ sha1h lane2_msg_1_s,lane2_abcd_s
+ sha1h lane3_msg_1_s,lane3_abcd_s
+ sha1p lane0_abcd_q,lane0_msg_2_s,lane0_tmp1_v.4s
+ sha1p lane1_abcd_q,lane1_msg_2_s,lane1_tmp1_v.4s
+ sha1p lane2_abcd_q,lane2_msg_2_s,lane2_tmp1_v.4s
+ sha1p lane3_abcd_q,lane3_msg_2_s,lane3_tmp1_v.4s
+ add digest_adr,digests,block_cnt
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[0],[digest_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[1],[digest_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[2],[digest_adr],offs
+ ld4 {lane0_msg_0_v.S-lane3_msg_0_v.S}[3],[digest_adr],offs
+ ld4 {lane0_msg_3_v.S-lane3_msg_3_v.S}[0],[digest_adr]
+
+ add lane0_abcd_v.4S,lane0_abcd_v.4S,lane0_msg_0_v.4S
+ add lane1_abcd_v.4S,lane1_abcd_v.4S,lane1_msg_0_v.4S
+ add lane2_abcd_v.4S,lane2_abcd_v.4S,lane2_msg_0_v.4S
+ add lane3_abcd_v.4S,lane3_abcd_v.4S,lane3_msg_0_v.4S
+
+ add lane0_msg_1_v.4S,lane0_msg_1_v.4S,lane0_msg_3_v.4S
+ add lane1_msg_1_v.4S,lane1_msg_1_v.4S,lane1_msg_3_v.4S
+ add lane2_msg_1_v.4S,lane2_msg_1_v.4S,lane2_msg_3_v.4S
+ add lane3_msg_1_v.4S,lane3_msg_1_v.4S,lane3_msg_3_v.4S
+
+ add digest_adr,digests,block_cnt
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[0],[digest_adr],offs
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[1],[digest_adr],offs
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[2],[digest_adr],offs
+ st4 {lane0_abcd_v.S-lane3_abcd_v.S}[3],[digest_adr],offs
+ st4 {lane0_msg_1_v.S-lane3_msg_1_v.S}[0],[digest_adr]
+
+ add block_cnt,block_cnt,16
+ cmp block_cnt,64
+ add msg_adr,input_data,block_cnt
+ add digest_adr,digests,block_cnt
+ bcc lane_loop
+
+ subs num_blocks,num_blocks,1
+ add input_data,input_data,1024
+ bhi start_loop
+
+ /* save murmur-hash digest */
+ str mur_hash1, [mur_digest], #8
+ str mur_hash2, [mur_digest]
+
+exit_func:
+	// restore callee-saved registers
+ ldp d10, d11, [sp, 16]
+ ldp d12, d13, [sp, 32]
+ ldp d14, d15, [sp, 48]
+ ldp x19, x20, [sp, 64]
+ ldp d8, d9, [sp], 80
+ ret
+
+ .size mh_sha1_murmur3_block_ce, .-mh_sha1_murmur3_block_ce
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY_0:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+KEY_1:
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+KEY_2:
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+KEY_3:
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+
+N1:
+ .word 0x52dce729
+ .word 0x52dce729
+ .word 0x52dce729
+ .word 0x52dce729
+N2:
+ .word 0x38495ab5
+ .word 0x38495ab5
+ .word 0x38495ab5
+ .word 0x38495ab5
+
+C1:
+ .dword 0x87c37b91114253d5
+ .dword 0x87c37b91114253d5
+C2:
+ .dword 0x4cf5ad432745937f
+ .dword 0x4cf5ad432745937f
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_ce.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_ce.c
new file mode 100644
index 000000000..4da674fba
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_ce.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_murmur3_aarch64_internal.h"
+
+extern void mh_sha1_tail_ce(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t * frame_buffer,
+ uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+extern void mh_sha1_block_ce(const uint8_t * input_data,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
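+// The wrappers below are stamped out by including the generic base sources
+// with UPDATE_FUNCTION/BLOCK_FUNCTION (and FINALIZE_FUNCTION/
+// MH_SHA1_TAIL_FUNCTION) bound to the CE-specific symbols above.
+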
+// mh_sha1_murmur3_update_ce.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_update_ce
+#define BLOCK_FUNCTION mh_sha1_murmur3_block_ce
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+// mh_sha1_murmur3_finalize_ce.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_finalize_ce
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_ce
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_multibinary.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_multibinary.S
new file mode 100644
index 000000000..051a6157e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/mh_sha1_murmur3_multibinary.S
@@ -0,0 +1,34 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+mbin_interface mh_sha1_murmur3_x64_128_update
+mbin_interface mh_sha1_murmur3_x64_128_finalize
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/sha1_asimd_common.S b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/sha1_asimd_common.S
new file mode 100644
index 000000000..ccc66f41a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/aarch64/sha1_asimd_common.S
@@ -0,0 +1,271 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+// macro F = (D ^ (B & (C ^ D)))
+.macro FUNC_F0
+ eor VF.16b, VC.16b, VD.16b
+ and VF.16b, VB.16b, VF.16b
+ eor VF.16b, VD.16b, VF.16b
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F1
+ eor VF.16b, VB.16b, VC.16b
+ eor VF.16b, VF.16b, VD.16b
+.endm
+
+// F = ((B & C) | (B & D) | (C & D))
+.macro FUNC_F2
+ and vT0.16b, VB.16b, VC.16b
+ and vT1.16b, VB.16b, VD.16b
+ and vT2.16b, VC.16b, VD.16b
+ orr VF.16b, vT0.16b, vT1.16b
+ orr VF.16b, VF.16b, vT2.16b
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F3
+ FUNC_F1
+.endm
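+
+/*
+ * Round-function schedule from the SHA1 spec (FIPS 180-1): FUNC_F0 (Ch)
+ * covers rounds 0-19, FUNC_F1 (Parity) rounds 20-39, FUNC_F2 (Maj) rounds
+ * 40-59, and rounds 60-79 reuse Parity, which is why FUNC_F3 simply expands
+ * to FUNC_F1.
+ */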
+
+.altmacro
+.macro load_next_word windex
+ .if \windex < 16
+ load_x4_word \windex
+ .endif
+.endm
+
+// FUNC_F0 is merged into STEP_00_15 for efficiency
+.macro SHA1_STEP_00_15_F0 windex:req
+ rev32 WORD\windex\().16b,WORD\windex\().16b
+ next_word=\windex+1
+ load_next_word %next_word
+ // e = (a leftrotate 5) + f + e + k + w[i]
+ ushr VT.4s, VA.4s, 32 - 5
+ add VE.4s, VE.4s, VK.4s
+ sli VT.4s, VA.4s, 5
+ eor VF.16b, VC.16b, VD.16b
+ add VE.4s, VE.4s, WORD\windex\().4s
+ and VF.16b, VB.16b, VF.16b
+ add VE.4s, VE.4s, VT.4s
+ eor VF.16b, VD.16b, VF.16b
+ ushr VT.4s, VB.4s, 32 - 30
+ add VE.4s, VE.4s, VF.4s
+ sli VT.4s, VB.4s, 30
+.endm
+
+.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req
+ eor vT0.16b,\reg_3\().16b,\reg_8\().16b
+ eor VT.16b,\reg_14\().16b,\reg_16\().16b
+ sha1_step_16_79_interleave0 \windex
+ eor vT0.16b,vT0.16b,VT.16b
+ sha1_step_16_79_interleave1 \windex
+ // e = (a leftrotate 5) + f + e + k + w[i]
+ ushr VT.4s, vT0.4s, 32 - 1
+ add VE.4s, VE.4s, VK.4s
+ ushr vT1.4s, VA.4s, 32 - 5
+ sli VT.4s, vT0.4s, 1
+ add VE.4s, VE.4s, VT.4s
+ sli vT1.4s, VA.4s, 5
+ mov \reg_16\().16b,VT.16b
+ add VE.4s, VE.4s, vT1.4s
+ ushr VT.4s, VB.4s, 32 - 30
+ \func_f
+ add VE.4s, VE.4s, VF.4s
+ sli VT.4s, VB.4s, 30
+.endm
+
+ VA .req v0
+ VB .req v1
+ VC .req v2
+ VD .req v3
+ VE .req v4
+ VT .req v5
+ VF .req v6
+ VK .req v7
+ WORD0 .req v8
+ WORD1 .req v9
+ WORD2 .req v10
+ WORD3 .req v11
+ WORD4 .req v12
+ WORD5 .req v13
+ WORD6 .req v14
+ WORD7 .req v15
+ WORD8 .req v16
+ WORD9 .req v17
+ WORD10 .req v18
+ WORD11 .req v19
+ WORD12 .req v20
+ WORD13 .req v21
+ WORD14 .req v22
+ WORD15 .req v23
+ vT0 .req v24
+ vT1 .req v25
+ vT2 .req v26
+ vAA .req v27
+ vBB .req v28
+ vCC .req v29
+ vDD .req v30
+ vEE .req v31
+ TT .req v0
+ sha1key_adr .req x15
+
+.macro SWAP_STATES
+ // shifted VB is held in VT after each step
+ .unreq TT
+ TT .req VE
+ .unreq VE
+ VE .req VD
+ .unreq VD
+ VD .req VC
+ .unreq VC
+ VC .req VT
+ .unreq VT
+ VT .req VB
+ .unreq VB
+ VB .req VA
+ .unreq VA
+ VA .req TT
+.endm
+
+.altmacro
+.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req
+ SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\()
+.endm
+
+.macro exec_step windex:req
+ .if \windex <= 15
+ SHA1_STEP_00_15_F0 windex
+ .else
+ idx14=((\windex - 14) & 15)
+ idx8=((\windex - 8) & 15)
+ idx3=((\windex - 3) & 15)
+ idx16=(\windex & 15)
+ .if \windex <= 19
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 20 && \windex <= 39
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 40 && \windex <= 59
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 60 && \windex <= 79
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .endif
+
+ SWAP_STATES
+
+ .if \windex == 79
+	// after 80 steps, the registers ABCDET have shifted from
+	// their original order of 012345 to 341520,
+	// so they have to be swapped back for both compile- and run-time correctness
+ mov v0.16b,v3.16b
+ .unreq VA
+ VA .req v0
+
+ mov vT0.16b,v2.16b
+ mov v2.16b,v1.16b
+ mov v1.16b,v4.16b
+ .unreq VB
+ VB .req v1
+ .unreq VC
+ VC .req v2
+
+ mov v3.16b,v5.16b
+ .unreq VD
+ VD .req v3
+
+ mov v4.16b,vT0.16b
+ .unreq VE
+ VE .req v4
+
+ .unreq VT
+ VT .req v5
+ .endif
+.endm
+
+.macro exec_steps idx:req,more:vararg
+ exec_step \idx
+ .ifnb \more
+ exec_steps \more
+ .endif
+.endm
+
+.macro sha1_single
+ load_x4_word 0
+
+ mov vAA.16B, VA.16B
+ mov vBB.16B, VB.16B
+ mov vCC.16B, VC.16B
+ mov vDD.16B, VD.16B
+ mov vEE.16B, VE.16B
+
+ adr sha1key_adr, KEY_0
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
+
+ // 20 ~ 39
+ adr sha1key_adr, KEY_1
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
+
+ // 40 ~ 59
+ adr sha1key_adr, KEY_2
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
+
+ // 60 ~ 79
+ adr sha1key_adr, KEY_3
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79
+
+ add VA.4s, vAA.4s, VA.4s
+ add VB.4s, vBB.4s, VB.4s
+ add VC.4s, vCC.4s, VC.4s
+ add VD.4s, vDD.4s, VD.4s
+ add VE.4s, vEE.4s, VE.4s
+.endm
+
+.macro sha1_asimd_save_stack
+ stp d8,d9,[sp, -64]!
+ stp d10,d11,[sp, 16]
+ stp d12,d13,[sp, 32]
+ stp d14,d15,[sp, 48]
+.endm
+
+.macro sha1_asimd_restore_stack
+ ldp d10,d11,[sp, 16]
+ ldp d12,d13,[sp, 32]
+ ldp d14,d15,[sp, 48]
+ ldp d8,d9,[sp],64
+.endm
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c
new file mode 100644
index 000000000..518adb797
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c
@@ -0,0 +1,154 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_murmur3_x64_128_internal.h"
+
+int mh_sha1_murmur3_x64_128_init(struct mh_sha1_murmur3_x64_128_ctx *ctx, uint64_t murmur_seed)
+{
+ uint64_t *murmur3_x64_128_hash;
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint32_t i;
+
+ if (ctx == NULL)
+ return MH_SHA1_MURMUR3_CTX_ERROR_NULL;
+
+ memset(ctx, 0, sizeof(*ctx));
+
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+ for (i = 0; i < HASH_SEGS; i++) {
+ mh_sha1_segs_digests[0][i] = MH_SHA1_H0;
+ mh_sha1_segs_digests[1][i] = MH_SHA1_H1;
+ mh_sha1_segs_digests[2][i] = MH_SHA1_H2;
+ mh_sha1_segs_digests[3][i] = MH_SHA1_H3;
+ mh_sha1_segs_digests[4][i] = MH_SHA1_H4;
+ }
+
+ murmur3_x64_128_hash = (uint64_t *) ctx->murmur3_x64_128_digest;
+ murmur3_x64_128_hash[0] = murmur_seed;
+ murmur3_x64_128_hash[1] = murmur_seed;
+
+ return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+}
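+
+/*
+ * Illustrative usage of this context (a sketch only; murmur_seed, buffer and
+ * buffer_len are caller-supplied values, and the digest arrays assume the
+ * 20-byte SHA1 / 16-byte murmur3_x64_128 output sizes):
+ *
+ *   struct mh_sha1_murmur3_x64_128_ctx ctx;
+ *   uint32_t sha1_digest[SHA1_DIGEST_WORDS];
+ *   uint32_t murmur_digest[MURMUR3_x64_128_DIGEST_WORDS];
+ *
+ *   mh_sha1_murmur3_x64_128_init(&ctx, murmur_seed);
+ *   mh_sha1_murmur3_x64_128_update(&ctx, buffer, buffer_len);
+ *   mh_sha1_murmur3_x64_128_finalize(&ctx, sha1_digest, murmur_digest);
+ */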
+
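+/*
+ * Portable fallback block routine: it simply runs the multi-hash SHA1 base
+ * block function and the murmur3_x64_128 block function back to back over
+ * the same input, whereas the SSE/AVX/AVX2 block routines interleave the
+ * scalar murmur3 work with the vectorized SHA1 rounds.
+ */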
+void mh_sha1_murmur3_x64_128_block_base(const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t
+ murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks)
+{
+
+ mh_sha1_block_base(input_data, mh_sha1_digests, frame_buffer, num_blocks);
+
+ murmur3_x64_128_block(input_data,
+ num_blocks * MH_SHA1_BLOCK_SIZE / MUR_BLOCK_SIZE,
+ murmur3_x64_128_digests);
+
+ return;
+}
+
+#if (!defined(NOARCH)) && (defined(__i386__) || defined(__x86_64__) \
+ || defined( _M_X64) || defined(_M_IX86))
+/***************mh_sha1_murmur3_x64_128_update***********/
+// mh_sha1_murmur3_x64_128_update_sse.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_sse
+#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_sse
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+// mh_sha1_murmur3_x64_128_update_avx.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_avx
+#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_avx
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+// mh_sha1_murmur3_x64_128_update_avx2.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_avx2
+#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_avx2
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+/***************mh_sha1_murmur3_x64_128_finalize***********/
+// mh_sha1_murmur3_x64_128_finalize_sse.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_sse
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_sse
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+
+// mh_sha1_murmur3_x64_128_finalize_avx.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_avx
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+
+// mh_sha1_murmur3_x64_128_finalize_avx2.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_avx2
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx2
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+
+/***************version info***********/
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver mh_sha1_murmur3_x64_128_init_slver_00000251;
+struct slver mh_sha1_murmur3_x64_128_init_slver = { 0x0251, 0x00, 0x00 };
+
+// mh_sha1_murmur3_x64_128_update version info
+struct slver mh_sha1_murmur3_x64_128_update_sse_slver_00000254;
+struct slver mh_sha1_murmur3_x64_128_update_sse_slver = { 0x0254, 0x00, 0x00 };
+
+struct slver mh_sha1_murmur3_x64_128_update_avx_slver_02000256;
+struct slver mh_sha1_murmur3_x64_128_update_avx_slver = { 0x0256, 0x00, 0x02 };
+
+struct slver mh_sha1_murmur3_x64_128_update_avx2_slver_04000258;
+struct slver mh_sha1_murmur3_x64_128_update_avx2_slver = { 0x0258, 0x00, 0x04 };
+
+// mh_sha1_murmur3_x64_128_finalize version info
+struct slver mh_sha1_murmur3_x64_128_finalize_sse_slver_00000255;
+struct slver mh_sha1_murmur3_x64_128_finalize_sse_slver = { 0x0255, 0x00, 0x00 };
+
+struct slver mh_sha1_murmur3_x64_128_finalize_avx_slver_02000257;
+struct slver mh_sha1_murmur3_x64_128_finalize_avx_slver = { 0x0257, 0x00, 0x02 };
+
+struct slver mh_sha1_murmur3_x64_128_finalize_avx2_slver_04000259;
+struct slver mh_sha1_murmur3_x64_128_finalize_avx2_slver = { 0x0259, 0x00, 0x04 };
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c
new file mode 100644
index 000000000..fbef1ac13
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c
@@ -0,0 +1,67 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_murmur3_x64_128_internal.h"
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+/***************mh_sha1_murmur3_x64_128_update***********/
+// mh_sha1_murmur3_x64_128_update_avx512.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_avx512
+#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_avx512
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+/***************mh_sha1_murmur3_x64_128_finalize***********/
+// mh_sha1_murmur3_x64_128_finalize_avx512.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_avx512
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx512
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+
+/***************version info***********/
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// mh_sha1_murmur3_x64_128_update version info
+struct slver mh_sha1_murmur3_x64_128_update_avx512_slver_0600025c;
+struct slver mh_sha1_murmur3_x64_128_update_avx512_slver = { 0x025c, 0x00, 0x06 };
+
+// mh_sha1_murmur3_x64_128_finalize version info
+struct slver mh_sha1_murmur3_x64_128_finalize_avx512_slver_0600025d;
+struct slver mh_sha1_murmur3_x64_128_finalize_avx512_slver = { 0x025d, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_base_aliases.c
new file mode 100644
index 000000000..28f15086d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_base_aliases.c
@@ -0,0 +1,43 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "mh_sha1_murmur3_x64_128_internal.h"
+#include <string.h>
+int mh_sha1_murmur3_x64_128_update(struct mh_sha1_murmur3_x64_128_ctx *ctx, const void *buffer,
+ uint32_t len)
+{
+ return mh_sha1_murmur3_x64_128_update_base(ctx, buffer, len);
+
+}
+
+int mh_sha1_murmur3_x64_128_finalize(struct mh_sha1_murmur3_x64_128_ctx *ctx,
+ void *mh_sha1_digest, void *murmur3_x64_128_digest)
+{
+ return mh_sha1_murmur3_x64_128_finalize_base(ctx, mh_sha1_digest,
+ murmur3_x64_128_digest);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm
new file mode 100644
index 000000000..4611494e0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm
@@ -0,0 +1,706 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA1 digests interleaved with murmur3_x64_128 using AVX
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF, %%regC,%%regD
+ vpand %%regF, %%regF,%%regB
+ vpxor %%regF, %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-(%%imm))
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-(%%imm))
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[%%data + (%%memW * 16)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;; Murmur3 instructions are interleaved into this macro.
+;; Every section_loop of mh_sha1 calls SHA1_STEP_16_79 64 times and processes 256 bytes,
+;; so one 16-byte murmur block is folded into every 4 SHA1_STEP_16_79 steps.
+%define SHA1_STEP_16_79(J) SHA1_STEP_16_79_ %+ J
+
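+;; As an illustration only (not assembled), the scalar murmur3 work spread
+;; across the four SHA1_STEP_16_79_0..3 variants below corresponds to this
+;; per-16-byte C sketch, using the R1-R4, M, N1/N2 and C1/C2 constants defined
+;; later in this file (rotl64, load64, k1, k2, h1, h2, p are illustrative names):
+;;
+;;   k1 = load64(p);  k2 = load64(p + 8);       // _0: loads and k *= C1/C2
+;;   k1 *= C1;        k2 *= C2;
+;;   k1 = rotl64(k1, R1) * C2;                  // _1: rotates, second multiplies,
+;;   k2 = rotl64(k2, R2) * C1;  p += 16;        //     and input-pointer advance
+;;   h1 ^= k1;  h1 = rotl64(h1, R3) + h2;       // _2: fold k1 into h1
+;;   h1 = h1 * M + N1;
+;;   h2 ^= k2;  h2 = rotl64(h2, R4) + h1;       // _3: fold k2 into h2
+;;   h2 = h2 * M + N2;
+;;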
+%macro SHA1_STEP_16_79_0 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ mov mur_data1, [mur_in_p]
+ mov mur_data2, [mur_in_p + 8]
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ imul mur_data1, mur_c1_r
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ imul mur_data2, mur_c2_r
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_1 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ rol mur_data1, R1
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ rol mur_data2, R2
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ imul mur_data1, mur_c2_r
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ imul mur_data2, mur_c1_r
+ PROLD %%regB,30, %%regT
+ add mur_in_p, 16
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_2 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ xor mur_hash1, mur_data1
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+ rol mur_hash1, R3
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ add mur_hash1, mur_hash2
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1]
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_3 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ xor mur_hash2, mur_data2
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+ rol mur_hash2, R4
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ add mur_hash2, mur_hash1
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2]
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8d
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbx
+ push rbp
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10d
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8
+ %define PS 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ save_reg rbx, 10*16 + 6*8
+ save_reg rbp, 10*16 + 7*8
+ end_prolog
+ mov arg4, arg(4)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ mov rbx, [rsp + 10*16 + 6*8]
+ mov rbp, [rsp + 10*16 + 7*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg4
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables of murmur3
+%define mur_in_p tmp2
+%define mur_digest_p arg3
+%define mur_hash1 tmp3
+%define mur_hash2 tmp4
+%define mur_data1 tmp5
+%define mur_data2 return
+%define mur_c1_r tmp6
+%define mur_c2_r arg5
+; constants of murmur3_x64_128
+%define R1 31
+%define R2 33
+%define R3 27
+%define R4 31
+%define M 5
+%define N1 0x52dce729;DWORD
+%define N2 0x38495ab5;DWORD
+%define C1 QWORD(0x87c37b91114253d5)
+%define C2 QWORD(0x4cf5ad432745937f)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp7
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 |    | Ea| Eb | Ec |...| Ep |
+
+align 32
+;void mh_sha1_murmur3_x64_128_block_avx (const uint8_t * input_data,
+; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+; uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer, which is used to save the big-endian data.
+; arg 3 pointer to murmur3 digest
+; arg 4 number of 1KB blocks
+;
+mk_global mh_sha1_murmur3_x64_128_block_avx, function, internal
+func(mh_sha1_murmur3_x64_128_block_avx)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+	; align rsp to 16 bytes, as required by AVX
+ and rsp, ~0x0F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 5
+ VMOVPS A, [mh_digests_p + I*64 + 16*0]
+ VMOVPS B, [mh_digests_p + I*64 + 16*1]
+ VMOVPS C, [mh_digests_p + I*64 + 16*2]
+ VMOVPS D, [mh_digests_p + I*64 + 16*3]
+
+ vmovdqa [rsp + I*64 + 16*0], A
+ vmovdqa [rsp + I*64 + 16*1], B
+ vmovdqa [rsp + I*64 + 16*2], C
+ vmovdqa [rsp + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+
+ ;init murmur variables
+	mov	mur_in_p, mh_in_p	;murmur and mh_sha1 advance through the input at different strides
+ ;load murmur hash digests and multiplier
+ mov mur_hash1, [mur_digest_p]
+ mov mur_hash2, [mur_digest_p + 8]
+ mov mur_c1_r, C1
+ mov mur_c2_r, C2
+
+.block_loop:
+	;transform to big-endian data and store it in the aligned frame buffer
+ vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4
+ %assign I 0
+ %rep 16
+ VMOVPS T0,[mh_in_p + I*64+0*16]
+ VMOVPS T1,[mh_in_p + I*64+1*16]
+ VMOVPS T2,[mh_in_p + I*64+2*16]
+ VMOVPS T3,[mh_in_p + I*64+3*16]
+
+ vpshufb T0, F
+ vmovdqa [mh_data_p +(I)*16 +0*256],T0
+ vpshufb T1, F
+ vmovdqa [mh_data_p +(I)*16 +1*256],T1
+ vpshufb T2, F
+ vmovdqa [mh_data_p +(I)*16 +2*256],T2
+ vpshufb T3, F
+ vmovdqa [mh_data_p +(I)*16 +3*256],T3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+ .segs_loop:
+ ;; Initialize digests
+ vmovdqa A, [rsp + 0*64 + mh_segs]
+ vmovdqa B, [rsp + 1*64 + mh_segs]
+ vmovdqa C, [rsp + 2*64 + mh_segs]
+ vmovdqa D, [rsp + 3*64 + mh_segs]
+ vmovdqa E, [rsp + 4*64 + mh_segs]
+
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 16]
+ vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 16]
+ %rep 4
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 20...39
+ vmovdqa K, [K20_39]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 40...59
+ vmovdqa K, [K40_59]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 60...79
+ vmovdqa K, [K60_79]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ vpaddd A, AA
+ vpaddd B, BB
+ vpaddd C, CC
+ vpaddd D, DD
+ vpaddd E, EE
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], A
+ vmovdqa [rsp + 1*64 + mh_segs], B
+ vmovdqa [rsp + 2*64 + mh_segs], C
+ vmovdqa [rsp + 3*64 + mh_segs], D
+ vmovdqa [rsp + 4*64 + mh_segs], E
+
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ ;store murmur-hash digest
+ mov [mur_digest_p], mur_hash1
+ mov [mur_digest_p + 8], mur_hash2
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 5
+ vmovdqa A, [rsp + I*64 + 16*0]
+ vmovdqa B, [rsp + I*64 + 16*1]
+ vmovdqa C, [rsp + I*64 + 16*2]
+ vmovdqa D, [rsp + I*64 + 16*3]
+
+ VMOVPS [mh_digests_p + I*64 + 16*0], A
+ VMOVPS [mh_digests_p + I*64 + 16*1], B
+ VMOVPS [mh_digests_p + I*64 + 16*2], C
+ VMOVPS [mh_digests_p + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm
new file mode 100644
index 000000000..3fb440bf1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm
@@ -0,0 +1,653 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16-segment mh_sha1 stitched with murmur3_x64_128 using AVX2
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; Magic functions defined in FIPS 180-1
+;;
+;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF, %%regC,%%regD
+ vpand %%regF, %%regF,%%regB
+ vpxor %%regF, %%regF,%%regD
+%endmacro
+
+;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+
+
+;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-%%imm)
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[%%data + (%%memW * 32)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;; Insert murmur3's scalar instructions into these macros.
+;; Every segs_loop of mh_sha1 calls SHA1_STEP_16_79 64 times and processes 512 bytes,
+;; so one 16-byte murmur3 block is interleaved into every 2 SHA1_STEP_16_79 calls
+;; (a reference C sketch of that murmur3 block follows the dispatch define below).
+%define SHA1_STEP_16_79(J) SHA1_STEP_16_79_ %+ J
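+;;
+;; Reference only (not upstream code): a minimal C sketch of the single
+;; murmur3_x64_128 16-byte block round whose scalar instructions are split
+;; across SHA1_STEP_16_79_0/_1, written with this file's R1-R4, N1/N2, C1/C2
+;; (load64/rotl64 are placeholder helpers):
+;;
+;;   k1 = load64(p);  k2 = load64(p + 8);            /* mur_data1, mur_data2 */
+;;   k1 *= C1;  k1 = rotl64(k1, R1);  k1 *= C2;
+;;   k2 *= C2;  k2 = rotl64(k2, R2);  k2 *= C1;
+;;   h1 ^= k1;  h1 = rotl64(h1, R3);  h1 += h2;      /* mur_hash1 */
+;;   h1 = h1 * 5 + N1;
+;;   h2 ^= k2;  h2 = rotl64(h2, R4);  h2 += h1;      /* mur_hash2 */
+;;   h2 = h2 * 5 + N2;
+;;   p += 16;                                        /* mur_in_p */
+;;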
+
+%macro SHA1_STEP_16_79_0 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32]
+ mov mur_data1, [mur_in_p]
+ mov mur_data2, [mur_in_p + 8]
+
+ vpsrld %%regF, W16, (32-1)
+ imul mur_data1, mur_c1_r
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ imul mur_data2, mur_c2_r
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF
+ rol mur_data1, R1
+ vpaddd %%regE, %%regE,%%regF
+ rol mur_data2, R2
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ imul mur_data1, mur_c2_r
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ imul mur_data2, mur_c1_r
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+
+%macro SHA1_STEP_16_79_1 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ xor mur_hash1, mur_data1
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32]
+ rol mur_hash1, R3
+ vpxor W16, W16, W14
+ add mur_hash1, mur_hash2
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32]
+ lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1]
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ xor mur_hash2, mur_data2
+ vpor %%regF, %%regF, W16
+ rol mur_hash2, R4
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF
+ vpaddd %%regE, %%regE,%%regF
+ add mur_hash2, mur_hash1
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2]
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ add mur_in_p, 16
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8d
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbx
+ push rbp
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10d
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8
+ %define PS 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ save_reg rbx, 10*16 + 6*8
+ save_reg rbp, 10*16 + 7*8
+ end_prolog
+ mov arg4, arg(4)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ mov rbx, [rsp + 10*16 + 6*8]
+ mov rbp, [rsp + 10*16 + 7*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg4
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables of murmur3
+%define mur_in_p tmp2
+%define mur_digest_p arg3
+%define mur_hash1 tmp3
+%define mur_hash2 tmp4
+%define mur_data1 tmp5
+%define mur_data2 return
+%define mur_c1_r tmp6
+%define mur_c2_r arg5
+; constants of murmur3_x64_128
+%define R1 31
+%define R2 33
+%define R3 27
+%define R4 31
+%define M 5
+%define N1 0x52dce729;DWORD
+%define N2 0x38495ab5;DWORD
+%define C1 QWORD(0x87c37b91114253d5)
+%define C2 QWORD(0x4cf5ad432745937f)
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp7
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+
+%define pref tmp8
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define A ymm0
+%define B ymm1
+%define C ymm2
+%define D ymm3
+%define E ymm4
+
+%define F ymm5
+%define T0 ymm6
+%define T1 ymm7
+%define T2 ymm8
+%define T3 ymm9
+%define T4 ymm10
+%define T5 ymm11
+%define T6 ymm12
+%define T7 ymm13
+%define T8 ymm14
+%define T9 ymm15
+
+%define AA ymm5
+%define BB ymm6
+%define CC ymm7
+%define DD ymm8
+%define EE ymm9
+%define TMP ymm10
+%define FUN ymm11
+%define K ymm12
+%define W14 ymm13
+%define W15 ymm14
+%define W16 ymm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 |    | Ea| Eb | Ec |...| Ep |
+
+align 32
+;void mh_sha1_murmur3_x64_128_block_avx2 (const uint8_t * input_data,
+; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+; uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
+; arg 3 pointer to murmur3 digest
+; arg 4 number of 1KB blocks
+;
+mk_global mh_sha1_murmur3_x64_128_block_avx2, function, internal
+func(mh_sha1_murmur3_x64_128_block_avx2)
+ endbranch
+ FUNC_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 32 Bytes needed by avx2
+ and rsp, ~0x1F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 2
+ VMOVPS A, [mh_digests_p + I*32*5 + 32*0]
+ VMOVPS B, [mh_digests_p + I*32*5 + 32*1]
+ VMOVPS C, [mh_digests_p + I*32*5 + 32*2]
+ VMOVPS D, [mh_digests_p + I*32*5 + 32*3]
+ VMOVPS E, [mh_digests_p + I*32*5 + 32*4]
+
+ vmovdqa [rsp + I*32*5 + 32*0], A
+ vmovdqa [rsp + I*32*5 + 32*1], B
+ vmovdqa [rsp + I*32*5 + 32*2], C
+ vmovdqa [rsp + I*32*5 + 32*3], D
+ vmovdqa [rsp + I*32*5 + 32*4], E
+ %assign I (I+1)
+ %endrep
+
+ ;init murmur variables
+	mov	mur_in_p, mh_in_p	;murmur and mh_sha1 walk the input at different strides, so keep a separate pointer
+ ;load murmur hash digests and multiplier
+ mov mur_hash1, [mur_digest_p]
+ mov mur_hash2, [mur_digest_p + 8]
+ mov mur_c1_r, C1
+ mov mur_c2_r, C2
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*8_SEGS*5*2
+%assign I 0
+%rep 16
+ VMOVPS T0,[mh_in_p + I*64+0*32]
+ VMOVPS T1,[mh_in_p + I*64+1*32]
+
+ vpshufb T0, T0, F
+ vmovdqa [mh_data_p +I*32+0*512],T0
+ vpshufb T1, T1, F
+ vmovdqa [mh_data_p +I*32+1*512],T1
+%assign I (I+1)
+%endrep
+
+ mov mh_segs, 0 ;start from the first 8 segments
+	mov	pref, 1024	;avoid prefetching the same lines repeatedly
+ .segs_loop:
+ ;; Initialize digests
+ vmovdqa A, [rsp + 0*64 + mh_segs]
+ vmovdqa B, [rsp + 1*64 + mh_segs]
+ vmovdqa C, [rsp + 2*64 + mh_segs]
+ vmovdqa D, [rsp + 3*64 + mh_segs]
+ vmovdqa E, [rsp + 4*64 + mh_segs]
+
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 32]
+ vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 32]
+ %rep 4
+ %assign J (I % 2)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+ PREFETCH_X [mh_in_p + pref+128*1]
+;; do rounds 20...39
+ vmovdqa K, [K20_39]
+ %rep 20
+ %assign J (I % 2)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+;; do rounds 40...59
+ vmovdqa K, [K40_59]
+ %rep 20
+ %assign J (I % 2)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*2]
+ PREFETCH_X [mh_in_p + pref+128*3]
+;; do rounds 60...79
+ vmovdqa K, [K60_79]
+ %rep 20
+ %assign J (I % 2)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ vpaddd A,A, AA
+ vpaddd B,B, BB
+ vpaddd C,C, CC
+ vpaddd D,D, DD
+ vpaddd E,E, EE
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], A
+ vmovdqa [rsp + 1*64 + mh_segs], B
+ vmovdqa [rsp + 2*64 + mh_segs], C
+ vmovdqa [rsp + 3*64 + mh_segs], D
+ vmovdqa [rsp + 4*64 + mh_segs], E
+
+ add pref, 512
+
+ add mh_data_p, 512
+ add mh_segs, 32
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ ;store murmur-hash digest
+ mov [mur_digest_p], mur_hash1
+ mov [mur_digest_p + 8], mur_hash2
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 2
+ vmovdqa A, [rsp + I*32*5 + 32*0]
+ vmovdqa B, [rsp + I*32*5 + 32*1]
+ vmovdqa C, [rsp + I*32*5 + 32*2]
+ vmovdqa D, [rsp + I*32*5 + 32*3]
+ vmovdqa E, [rsp + I*32*5 + 32*4]
+
+ VMOVPS [mh_digests_p + I*32*5 + 32*0], A
+ VMOVPS [mh_digests_p + I*32*5 + 32*1], B
+ VMOVPS [mh_digests_p + I*32*5 + 32*2], C
+ VMOVPS [mh_digests_p + I*32*5 + 32*3], D
+ VMOVPS [mh_digests_p + I*32*5 + 32*4], E
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=32
+
+align 32
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm
new file mode 100644
index 000000000..a5c157078
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm
@@ -0,0 +1,504 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16-segment mh_sha1 stitched with murmur3_x64_128 using AVX-512
+;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovdqu64
+;SIMD variables definition
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define HH0 zmm5
+%define HH1 zmm6
+%define HH2 zmm7
+%define HH3 zmm8
+%define HH4 zmm9
+%define KT zmm10
+%define XTMP0 zmm11
+%define XTMP1 zmm12
+%define SHUF_MASK zmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;using extra 16 ZMM registers to place the inverse input data
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;macros definition
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%F_IMMED %2
+
+ ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
+ ; E=D, D=C, C=ROTL_30(B), B=A, A=T
+
+ ; Ft
+ ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
+ ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
+ ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
+
+ vmovdqa32 XTMP1, B ; Copy B
+ vpaddd E, E, %%WT ; E = E + Wt
+ vpternlogd XTMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D)
+ vpaddd E, E, KT ; E = E + Wt + Kt
+ vprold XTMP0, A, 5 ; TMP0 = ROTL_5(A)
+ vpaddd E, E, XTMP1 ; E = Ft(B,C,D) + E + Kt + Wt
+ vprold B, B, 30 ; B = ROTL_30(B)
+ vpaddd E, E, XTMP0 ; E = T
+
+ ROTATE_ARGS
+%endmacro
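+
+;; Reference only (not upstream code): the vpternlogd immediates used in this
+;; file are the 8-bit truth tables of Ft(B,C,D) with bit index (B<<2)|(C<<1)|D,
+;; which gives Ch -> 0xCA, Parity -> 0x96 and Maj -> 0xE8. A minimal C sketch
+;; that reproduces these constants:
+;;
+;;   uint8_t ternlog_imm(int (*f)(int, int, int)) {
+;;       uint8_t imm = 0;
+;;       for (int i = 0; i < 8; i++)                 /* i = (B<<2)|(C<<1)|D */
+;;           if (f((i >> 2) & 1, (i >> 1) & 1, i & 1))
+;;               imm |= (uint8_t)(1u << i);
+;;       return imm;   /* Ch -> 0xCA, Parity -> 0x96, Maj -> 0xE8 */
+;;   }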
+
+;; Insert murmur3's scalar instructions into these macros.
+;; Every block_loop of mh_sha1 runs PROCESS_LOOP(_MUR) 80 times and
+;; MSG_SCHED_ROUND_16_79_MUR 64 times and processes 1024 bytes,
+;; so one 16-byte murmur3 block is interleaved into each of the 64 stitched
+;; rounds (64 murmur3 blocks per block_loop).
+%macro PROCESS_LOOP_MUR 2
+%define %%WT %1
+%define %%F_IMMED %2
+
+ ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
+ ; E=D, D=C, C=ROTL_30(B), B=A, A=T
+
+ ; Ft
+ ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
+ ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
+ ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
+
+ mov mur_data1, [mur_in_p]
+ mov mur_data2, [mur_in_p + 8]
+ vmovdqa32 XTMP1, B ; Copy B
+ imul mur_data1, mur_c1_r
+ imul mur_data2, mur_c2_r
+ vpaddd E, E, %%WT ; E = E + Wt
+ rol mur_data1, R1
+ rol mur_data2, R2
+ vpternlogd XTMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D)
+ imul mur_data1, mur_c2_r
+ imul mur_data2, mur_c1_r
+ vpaddd E, E, KT ; E = E + Wt + Kt
+ xor mur_hash1, mur_data1
+ add mur_in_p, 16
+ vprold XTMP0, A, 5 ; TMP0 = ROTL_5(A)
+ rol mur_hash1, R3
+ vpaddd E, E, XTMP1 ; E = Ft(B,C,D) + E + Kt + Wt
+ add mur_hash1, mur_hash2
+ vprold B, B, 30 ; B = ROTL_30(B)
+ lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1]
+ vpaddd E, E, XTMP0 ; E = T
+ xor mur_hash2, mur_data2
+
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_79_MUR 4
+%define %%WT %1
+%define %%WTp2 %2
+%define %%WTp8 %3
+%define %%WTp13 %4
+ ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16)
+ ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt)
+ vpternlogd %%WT, %%WTp2, %%WTp8, 0x96
+ rol mur_hash2, R4
+ vpxord %%WT, %%WT, %%WTp13
+ add mur_hash2, mur_hash1
+ lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2]
+ vprold %%WT, %%WT, 1
+%endmacro
+
+%define APPEND(a,b) a %+ b
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8d
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbx
+ push rbp
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10d
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8
+ %define PS 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ ; remove unwind info macros
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp + 0*16], xmm6
+ movdqa [rsp + 1*16], xmm7
+ movdqa [rsp + 2*16], xmm8
+ movdqa [rsp + 3*16], xmm9
+ movdqa [rsp + 4*16], xmm10
+ movdqa [rsp + 5*16], xmm11
+ movdqa [rsp + 6*16], xmm12
+ movdqa [rsp + 7*16], xmm13
+ movdqa [rsp + 8*16], xmm14
+ movdqa [rsp + 9*16], xmm15
+ mov [rsp + 10*16 + 0*8], r12
+ mov [rsp + 10*16 + 1*8], r13
+ mov [rsp + 10*16 + 2*8], r14
+ mov [rsp + 10*16 + 3*8], r15
+ mov [rsp + 10*16 + 4*8], rdi
+ mov [rsp + 10*16 + 5*8], rsi
+ mov [rsp + 10*16 + 6*8], rbx
+ mov [rsp + 10*16 + 7*8], rbp
+ mov arg4, arg(4)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ mov rbx, [rsp + 10*16 + 6*8]
+ mov rbp, [rsp + 10*16 + 7*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg4
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables of murmur3
+%define mur_in_p tmp2
+%define mur_digest_p arg3
+%define mur_hash1 tmp3
+%define mur_hash2 tmp4
+%define mur_data1 tmp5
+%define mur_data2 return
+%define mur_c1_r tmp6
+%define mur_c2_r arg5
+; constants of murmur3_x64_128
+%define R1 31
+%define R2 33
+%define R3 27
+%define R4 31
+%define M 5
+%define N1 0x52dce729;DWORD
+%define N2 0x38495ab5;DWORD
+%define C1 QWORD(0x87c37b91114253d5)
+%define C2 QWORD(0x4cf5ad432745937f)
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp7
+
+%define pref tmp8
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 |    | Ea| Eb | Ec |...| Ep |
+
+[bits 64]
+section .text
+align 32
+
+;void mh_sha1_murmur3_x64_128_block_avx512 (const uint8_t * input_data,
+; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+; uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
+; arg 3 pointer to murmur3 digest
+; arg 4 number of 1KB blocks
+;
+global mh_sha1_murmur3_x64_128_block_avx512
+func(mh_sha1_murmur3_x64_128_block_avx512)
+ endbranch
+ FUNC_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; align rsp to 64 Bytes needed by avx512
+ and rsp, ~0x3f
+
+ ; copy segs_digests into registers.
+ VMOVPS HH0, [mh_digests_p + 64*0]
+ VMOVPS HH1, [mh_digests_p + 64*1]
+ VMOVPS HH2, [mh_digests_p + 64*2]
+ VMOVPS HH3, [mh_digests_p + 64*3]
+ VMOVPS HH4, [mh_digests_p + 64*4]
+ ;a mask used to transform to big-endian data
+ vmovdqa64 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ ;init murmur variables
+	mov	mur_in_p, mh_in_p	;murmur and mh_sha1 walk the input at different strides, so keep a separate pointer
+ ;load murmur hash digests and multiplier
+ mov mur_hash1, [mur_digest_p]
+ mov mur_hash2, [mur_digest_p + 8]
+ mov mur_c1_r, C1
+ mov mur_c2_r, C2
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ ;using extra 16 ZMM registers instead of stack
+%assign I 0
+%rep 8
+%assign J (I+1)
+ VMOVPS APPEND(W,I),[mh_in_p + I*64+0*64]
+ VMOVPS APPEND(W,J),[mh_in_p + I*64+1*64]
+
+ vpshufb APPEND(W,I), APPEND(W,I), SHUF_MASK
+ vpshufb APPEND(W,J), APPEND(W,J), SHUF_MASK
+%assign I (I+2)
+%endrep
+
+ vmovdqa64 A, HH0
+ vmovdqa64 B, HH1
+ vmovdqa64 C, HH2
+ vmovdqa64 D, HH3
+ vmovdqa64 E, HH4
+
+ vmovdqa32 KT, [K00_19]
+%assign I 0xCA
+%assign J 0
+%assign K 2
+%assign L 8
+%assign M 13
+%assign N 0
+%rep 80
+ %if N < 64 ; stitching 64 times
+ PROCESS_LOOP_MUR APPEND(W,J), I
+ MSG_SCHED_ROUND_16_79_MUR APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ %else ; 64 <= N < 80, without stitching
+ PROCESS_LOOP APPEND(W,J), I
+ %endif
+ %if N = 19
+ vmovdqa32 KT, [K20_39]
+ %assign I 0x96
+ %elif N = 39
+ vmovdqa32 KT, [K40_59]
+ %assign I 0xE8
+ %elif N = 59
+ vmovdqa32 KT, [K60_79]
+ %assign I 0x96
+ %endif
+ %if N % 20 = 19
+ PREFETCH_X [mh_in_p + 1024+128*(N / 20)]
+ PREFETCH_X [mh_in_p + 1024+128*(N / 20 +1)]
+ %endif
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%assign N (N+1)
+%endrep
+
+ ; Add old digest
+ vpaddd HH0,A, HH0
+ vpaddd HH1,B, HH1
+ vpaddd HH2,C, HH2
+ vpaddd HH3,D, HH3
+ vpaddd HH4,E, HH4
+
+ add mh_in_p, 1024
+ sub loops, 1
+ jne .block_loop
+
+ ;store murmur-hash digest
+ mov [mur_digest_p], mur_hash1
+ mov [mur_digest_p + 8], mur_hash2
+
+ ; copy segs_digests to mh_digests_p
+ VMOVPS [mh_digests_p + 64*0], HH0
+ VMOVPS [mh_digests_p + 64*1], HH1
+ VMOVPS [mh_digests_p + 64*2], HH2
+ VMOVPS [mh_digests_p + 64*3], HH3
+ VMOVPS [mh_digests_p + 64*4], HH4
+
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+
+section .data align=64
+
+align 64
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+
+K20_39: dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+
+K40_59: dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+
+K60_79: dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_murmur3_x64_128_block_avx512
+no_sha1_murmur3_x64_128_block_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm
new file mode 100644
index 000000000..ebd1b8b49
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm
@@ -0,0 +1,702 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16-segment mh_sha1 stitched with murmur3_x64_128 using SSE
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regC
+ pxor %%regF,%%regD
+ pand %%regF,%%regB
+ pxor %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regD
+ pxor %%regF,%%regC
+ pxor %%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regB
+ movdqa %%regT,%%regB
+ por %%regF,%%regC
+ pand %%regT,%%regC
+ pand %%regF,%%regD
+ por %%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ pslld %%reg, %%imm
+ psrld %%tmp, (32-%%imm)
+ por %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ paddd %%regE,[%%data + (%%memW * 16)]
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+;; Insert murmur3's scalar instructions into these macros.
+;; Every segs_loop of mh_sha1 calls SHA1_STEP_16_79 64 times and processes 256 bytes,
+;; so one 16-byte murmur3 block is interleaved into every 4 SHA1_STEP_16_79 calls
+;; (the split across the four macros is summarized after the dispatch define below).
+%define SHA1_STEP_16_79(J) SHA1_STEP_16_79_ %+ J
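+;;
+;; Reference summary (not upstream text) of how the murmur3 round is split:
+;;   _0 loads mur_data1/mur_data2 and multiplies them by C1/C2,
+;;   _1 rotates them by R1/R2, multiplies by C2/C1 and advances mur_in_p,
+;;   _2 folds mur_data1 into mur_hash1 (xor, rol R3, add mur_hash2, *5 + N1),
+;;   _3 folds mur_data2 into mur_hash2 (xor, rol R4, add mur_hash1, *5 + N2).
+;;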
+
+%macro SHA1_STEP_16_79_0 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+
+ paddd %%regE,%%immCNT
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ mov mur_data1, [mur_in_p]
+ mov mur_data2, [mur_in_p + 8]
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ imul mur_data1, mur_c1_r
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ imul mur_data2, mur_c2_r
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_1 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ rol mur_data1, R1
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ pslld W16, 1
+ rol mur_data2, R2
+ psrld %%regF, (32-1)
+ por %%regF, W16
+
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ imul mur_data1, mur_c2_r
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ imul mur_data2, mur_c1_r
+ PROLD %%regB,30, %%regT
+ add mur_in_p, 16
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_2 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ xor mur_hash1, mur_data1
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ rol mur_hash1, R3
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ add mur_hash1, mur_hash2
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1]
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_3 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ xor mur_hash2, mur_data2
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ rol mur_hash2, R4
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ add mur_hash2, mur_hash1
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2]
+ paddd %%regE,%%regF
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8d
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbx
+ push rbp
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10d
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8
+ %define PS 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ save_reg rbx, 10*16 + 6*8
+ save_reg rbp, 10*16 + 7*8
+ end_prolog
+ mov arg4, arg(4)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ mov rbx, [rsp + 10*16 + 6*8]
+ mov rbp, [rsp + 10*16 + 7*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg4
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables of murmur3
+%define mur_in_p tmp2
+%define mur_digest_p arg3
+%define mur_hash1 tmp3
+%define mur_hash2 tmp4
+%define mur_data1 tmp5
+%define mur_data2 return
+%define mur_c1_r tmp6
+%define mur_c2_r arg5
+; constants of murmur3_x64_128
+%define R1 31
+%define R2 33
+%define R3 27
+%define R4 31
+%define M 5
+%define N1 0x52dce729;DWORD
+%define N2 0x38495ab5;DWORD
+%define C1 QWORD(0x87c37b91114253d5)
+%define C2 QWORD(0x4cf5ad432745937f)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp7
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define MOVPS movups
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 |    | Ea| Eb | Ec |...| Ep |
+
+align 32
+;void mh_sha1_murmur3_x64_128_block_sse (const uint8_t * input_data,
+; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+; uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
+; arg 3 pointer to murmur3 digest
+; arg 4 number of 1KB blocks
+;
+mk_global mh_sha1_murmur3_x64_128_block_sse, function, internal
+func(mh_sha1_murmur3_x64_128_block_sse)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 16 Bytes needed by sse
+ and rsp, ~0x0F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 5
+ MOVPS A, [mh_digests_p + I*64 + 16*0]
+ MOVPS B, [mh_digests_p + I*64 + 16*1]
+ MOVPS C, [mh_digests_p + I*64 + 16*2]
+ MOVPS D, [mh_digests_p + I*64 + 16*3]
+
+ movdqa [rsp + I*64 + 16*0], A
+ movdqa [rsp + I*64 + 16*1], B
+ movdqa [rsp + I*64 + 16*2], C
+ movdqa [rsp + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+
+ ;init murmur variables
+	mov	mur_in_p, mh_in_p	;murmur and mh_sha1 walk the input at different strides, so keep a separate pointer
+ ;load murmur hash digests and multiplier
+ mov mur_hash1, [mur_digest_p]
+ mov mur_hash2, [mur_digest_p + 8]
+ mov mur_c1_r, C1
+ mov mur_c2_r, C2
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ movdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4
+ %assign I 0
+ %rep 16
+ MOVPS T0,[mh_in_p+I*64+0*16]
+ MOVPS T1,[mh_in_p+I*64+1*16]
+ MOVPS T2,[mh_in_p+I*64+2*16]
+ MOVPS T3,[mh_in_p+I*64+3*16]
+
+ pshufb T0, F
+ movdqa [mh_data_p+(I)*16 +0*256],T0
+ pshufb T1, F
+ movdqa [mh_data_p+(I)*16 +1*256],T1
+ pshufb T2, F
+ movdqa [mh_data_p+(I)*16 +2*256],T2
+ pshufb T3, F
+ movdqa [mh_data_p+(I)*16 +3*256],T3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+ .segs_loop:
+ ;; Initialize digests
+ movdqa A, [rsp + 0*64 + mh_segs]
+ movdqa B, [rsp + 1*64 + mh_segs]
+ movdqa C, [rsp + 2*64 + mh_segs]
+ movdqa D, [rsp + 3*64 + mh_segs]
+ movdqa E, [rsp + 4*64 + mh_segs]
+
+ movdqa AA, A
+ movdqa BB, B
+ movdqa CC, C
+ movdqa DD, D
+ movdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ movdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 16...19
+ movdqa W16, [mh_data_p + ((16 - 16) & 15) * 16]
+ movdqa W15, [mh_data_p + ((16 - 15) & 15) * 16]
+ %rep 4
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 20...39
+ movdqa K, [K20_39]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 40...59
+ movdqa K, [K40_59]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 60...79
+ movdqa K, [K60_79]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ paddd A, AA
+ paddd B, BB
+ paddd C, CC
+ paddd D, DD
+ paddd E, EE
+
+ ; write out digests
+ movdqa [rsp + 0*64 + mh_segs], A
+ movdqa [rsp + 1*64 + mh_segs], B
+ movdqa [rsp + 2*64 + mh_segs], C
+ movdqa [rsp + 3*64 + mh_segs], D
+ movdqa [rsp + 4*64 + mh_segs], E
+
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ ;store murmur-hash digest
+ mov [mur_digest_p], mur_hash1
+ mov [mur_digest_p + 8], mur_hash2
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 5
+ movdqa A, [rsp + I*64 + 16*0]
+ movdqa B, [rsp + I*64 + 16*1]
+ movdqa C, [rsp + I*64 + 16*2]
+ movdqa D, [rsp + I*64 + 16*3]
+
+ MOVPS [mh_digests_p + I*64 + 16*0], A
+ MOVPS [mh_digests_p + I*64 + 16*1], B
+ MOVPS [mh_digests_p + I*64 + 16*2], C
+ MOVPS [mh_digests_p + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c
new file mode 100644
index 000000000..4d09abf1d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c
@@ -0,0 +1,102 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef FINALIZE_FUNCTION
+#include <stdlib.h> // For NULL
+#include "mh_sha1_murmur3_x64_128_internal.h"
+
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_base
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_base
+#define FINALIZE_FUNCTION_SLVER
+#endif
+
+#define MURMUR_BLOCK_FUNCTION murmur3_x64_128_block
+#define MURMUR_TAIL_FUNCTION murmur3_x64_128_tail
+
+int FINALIZE_FUNCTION(struct mh_sha1_murmur3_x64_128_ctx *ctx, void *mh_sha1_digest,
+ void *murmur3_x64_128_digest)
+{
+ uint8_t *partial_block_buffer, *murmur_tail_data;
+ uint64_t partial_block_len, total_len;
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+
+ if (ctx == NULL)
+ return MH_SHA1_MURMUR3_CTX_ERROR_NULL;
+
+ total_len = ctx->total_length;
+ partial_block_len = total_len % MH_SHA1_BLOCK_SIZE;
+ partial_block_buffer = ctx->partial_block_buffer;
+
+	// Calculate murmur3 first, because mh_sha1 will modify partial_block_buffer.
+	// partial_block_buffer holds n whole murmur3 blocks followed by one murmur3 tail;
+	// e.g. with partial_block_len = 100, six 16-byte murmur3 blocks are hashed here
+	// and the remaining 4 bytes are left for the tail function.
+ murmur_tail_data =
+ partial_block_buffer + partial_block_len - partial_block_len % MUR_BLOCK_SIZE;
+ MURMUR_BLOCK_FUNCTION(partial_block_buffer, partial_block_len / MUR_BLOCK_SIZE,
+ ctx->murmur3_x64_128_digest);
+ MURMUR_TAIL_FUNCTION(murmur_tail_data, total_len, ctx->murmur3_x64_128_digest);
+
+ /* mh_sha1 final */
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+
+ MH_SHA1_TAIL_FUNCTION(partial_block_buffer, total_len, mh_sha1_segs_digests,
+ aligned_frame_buffer, ctx->mh_sha1_digest);
+
+ /* Output the digests of murmur3 and mh_sha1 */
+ if (mh_sha1_digest != NULL) {
+ ((uint32_t *) mh_sha1_digest)[0] = ctx->mh_sha1_digest[0];
+ ((uint32_t *) mh_sha1_digest)[1] = ctx->mh_sha1_digest[1];
+ ((uint32_t *) mh_sha1_digest)[2] = ctx->mh_sha1_digest[2];
+ ((uint32_t *) mh_sha1_digest)[3] = ctx->mh_sha1_digest[3];
+ ((uint32_t *) mh_sha1_digest)[4] = ctx->mh_sha1_digest[4];
+ }
+
+ if (murmur3_x64_128_digest != NULL) {
+ ((uint32_t *) murmur3_x64_128_digest)[0] = ctx->murmur3_x64_128_digest[0];
+ ((uint32_t *) murmur3_x64_128_digest)[1] = ctx->murmur3_x64_128_digest[1];
+ ((uint32_t *) murmur3_x64_128_digest)[2] = ctx->murmur3_x64_128_digest[2];
+ ((uint32_t *) murmur3_x64_128_digest)[3] = ctx->murmur3_x64_128_digest[3];
+ }
+
+ return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+}
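+
+/*
+ * Illustrative usage sketch (not part of the upstream sources): callers
+ * normally reach this finalize through the public API declared in
+ * mh_sha1_murmur3_x64_128.h; the entry-point names below are assumed from
+ * that header.
+ *
+ *	struct mh_sha1_murmur3_x64_128_ctx ctx;
+ *	uint32_t sha1_digest[SHA1_DIGEST_WORDS];
+ *	uint32_t murmur_digest[MURMUR3_x64_128_DIGEST_WORDS];
+ *
+ *	mh_sha1_murmur3_x64_128_init(&ctx, murmur_seed);
+ *	mh_sha1_murmur3_x64_128_update(&ctx, buffer, buffer_len);
+ *	mh_sha1_murmur3_x64_128_finalize(&ctx, sha1_digest, murmur_digest);
+ */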
+
+#ifdef FINALIZE_FUNCTION_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+ // Version info
+struct slver mh_sha1_murmur3_x64_128_finalize_base_slver_0000025b;
+struct slver mh_sha1_murmur3_x64_128_finalize_base_slver = { 0x025b, 0x00, 0x00 };
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h
new file mode 100644
index 000000000..e77837347
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h
@@ -0,0 +1,202 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA1_MURMUR3_X64_128_INTERNAL_H_
+#define _MH_SHA1_MURMUR3_X64_128_INTERNAL_H_
+
+/**
+ * @file mh_sha1_murmur3_x64_128_internal.h
+ * @brief mh_sha1_murmur3_x64_128 internal function prototypes and macros
+ *
+ * Interface for mh_sha1_murmur3_x64_128 internal functions
+ *
+ */
+#include <stdint.h>
+#include "mh_sha1_internal.h"
+#include "mh_sha1_murmur3_x64_128.h"
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+ /*******************************************************************
+ * mh_sha1_murmur3_x64_128 API internal function prototypes
+ * Multiple versions of Update and Finalize functions are supplied which use
+ * multiple versions of block and tail process subfunctions.
+ ******************************************************************/
+
+ /**
+ * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+	// Each version needs its own C or ASM file because these block functions are
+	// performance-critical. They are called by mh_sha1_murmur3_x64_128_update_XXX.
+ void mh_sha1_murmur3_x64_128_block (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
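+
+ /* Illustrative only (not part of the upstream header): invoking the
+ * runtime-dispatched block function above on two 1KB blocks, with buffers
+ * shaped as in the prototype; internal callers pass a 64-byte-aligned
+ * frame buffer.
+ *
+ *	uint32_t sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS];
+ *	uint32_t murmur_digests[MURMUR3_x64_128_DIGEST_WORDS];
+ *	uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE];
+ *
+ *	mh_sha1_murmur3_x64_128_block(input_data, sha1_digests,
+ *				      frame_buffer, murmur_digests, 2);
+ */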
+
+ /**
+ * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_murmur3_x64_128_block_base (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+ /**
+  * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires SSE
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_murmur3_x64_128_block_sse (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+ /**
+  * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires AVX
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_murmur3_x64_128_block_avx (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+ /**
+  * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires AVX2
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_murmur3_x64_128_block_avx2 (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+ /**
+  * @brief Calculate blocks whose size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires AVX512
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_murmur3_x64_128_block_avx512 (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+ /*******************************************************************
+ * murmur hash API
+ ******************************************************************/
+
+ /**
+  * @brief Calculate the murmur3 digest of blocks whose total size is 16*N bytes.
+ * @param input_data Pointer to input data to be processed
+  * @param num_blocks The number of 16-byte blocks.
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @returns none
+ *
+ */
+ void murmur3_x64_128_block(const uint8_t * input_data, uint32_t num_blocks,
+ uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS]);
+
+ /**
+  * @brief Process the tail, which is shorter than 16 bytes.
+ * @param tail_buffer Pointer to input data to be processed
+ * @param total_len The total length of the input_data
+ * @param digests Murmur3 digest
+ * @returns none
+ *
+ */
+ void murmur3_x64_128_tail(const uint8_t * tail_buffer, uint32_t total_len,
+ uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS]);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
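
The header above notes that each block function sits in its own C or ASM file and is called by an mh_sha1_murmur3_x64_128_update_XXX variant. Those arch-specific wrapper files are not part of this hunk; the sketch below only illustrates how such a variant could be produced by pre-defining the UPDATE_FUNCTION/BLOCK_FUNCTION macros and reusing the generic logic in mh_sha1_murmur3_x64_128_update_base.c (added later in this patch, and guarded with #ifndef UPDATE_FUNCTION). The file name mh_sha1_murmur3_x64_128_update_sse.c is an assumption.

    /* Hypothetical mh_sha1_murmur3_x64_128_update_sse.c: pre-define the
     * function-name macros, then include the generic update implementation,
     * whose own defaults are skipped because UPDATE_FUNCTION is defined. */
    #include "mh_sha1_murmur3_x64_128_internal.h"
    #include <string.h>

    #define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_sse
    #define BLOCK_FUNCTION  mh_sha1_murmur3_x64_128_block_sse

    #include "mh_sha1_murmur3_x64_128_update_base.c"
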
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm
new file mode 100644
index 000000000..6f9e54cdd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm
@@ -0,0 +1,76 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf32
+ [bits 32]
+%else
+ default rel
+ [bits 64]
+
+ extern mh_sha1_murmur3_x64_128_update_sse
+ extern mh_sha1_murmur3_x64_128_update_avx
+ extern mh_sha1_murmur3_x64_128_update_avx2
+ extern mh_sha1_murmur3_x64_128_finalize_sse
+ extern mh_sha1_murmur3_x64_128_finalize_avx
+ extern mh_sha1_murmur3_x64_128_finalize_avx2
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ extern mh_sha1_murmur3_x64_128_update_avx512
+ extern mh_sha1_murmur3_x64_128_finalize_avx512
+ %endif
+
+%endif
+
+extern mh_sha1_murmur3_x64_128_update_base
+extern mh_sha1_murmur3_x64_128_finalize_base
+
+mbin_interface mh_sha1_murmur3_x64_128_update
+mbin_interface mh_sha1_murmur3_x64_128_finalize
+
+%ifidn __OUTPUT_FORMAT__, elf64
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 mh_sha1_murmur3_x64_128_update, mh_sha1_murmur3_x64_128_update_base, mh_sha1_murmur3_x64_128_update_sse, mh_sha1_murmur3_x64_128_update_avx, mh_sha1_murmur3_x64_128_update_avx2, mh_sha1_murmur3_x64_128_update_avx512
+ mbin_dispatch_init6 mh_sha1_murmur3_x64_128_finalize, mh_sha1_murmur3_x64_128_finalize_base, mh_sha1_murmur3_x64_128_finalize_sse, mh_sha1_murmur3_x64_128_finalize_avx, mh_sha1_murmur3_x64_128_finalize_avx2, mh_sha1_murmur3_x64_128_finalize_avx512
+ %else
+ mbin_dispatch_init5 mh_sha1_murmur3_x64_128_update, mh_sha1_murmur3_x64_128_update_base, mh_sha1_murmur3_x64_128_update_sse, mh_sha1_murmur3_x64_128_update_avx, mh_sha1_murmur3_x64_128_update_avx2
+ mbin_dispatch_init5 mh_sha1_murmur3_x64_128_finalize, mh_sha1_murmur3_x64_128_finalize_base, mh_sha1_murmur3_x64_128_finalize_sse, mh_sha1_murmur3_x64_128_finalize_avx, mh_sha1_murmur3_x64_128_finalize_avx2
+ %endif
+
+%else
+ mbin_dispatch_init2 mh_sha1_murmur3_x64_128_update, mh_sha1_murmur3_x64_128_update_base
+ mbin_dispatch_init2 mh_sha1_murmur3_x64_128_finalize, mh_sha1_murmur3_x64_128_finalize_base
+%endif
+
+;;; func core, ver, snum
+slversion mh_sha1_murmur3_x64_128_update, 00, 02, 0252
+slversion mh_sha1_murmur3_x64_128_finalize, 00, 02, 0253
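
mbin_interface and mbin_dispatch_init5/6 (from multibinary.asm) set up runtime dispatch: the exported update/finalize symbols initially resolve to the best implementation the CPU supports (base, SSE, AVX, AVX2, or AVX512 when the assembler knows it) and then stay bound to it. The C model below is only a sketch of that idea, not the actual macro expansion; the two implementations and cpu_has_avx2() are stand-ins.

    /* Conceptual C model of multibinary dispatch: a function pointer starts
     * at a resolver, the resolver picks an implementation once, and later
     * calls jump straight to the chosen version. */
    #include <stdint.h>
    #include <stdio.h>

    typedef int (*update_fn)(const void *buf, uint32_t len);

    static int update_base(const void *buf, uint32_t len)
    {
    	(void)buf;
    	printf("base path, %u bytes\n", len);
    	return 0;
    }

    static int update_avx2(const void *buf, uint32_t len)
    {
    	(void)buf;
    	printf("avx2 path, %u bytes\n", len);
    	return 0;
    }

    static int cpu_has_avx2(void)
    {
    	return 0;	/* stand-in; a real probe would query CPUID for AVX2 support */
    }

    static int update_resolver(const void *buf, uint32_t len);
    static update_fn update_ptr = update_resolver;

    static int update_resolver(const void *buf, uint32_t len)
    {
    	update_ptr = cpu_has_avx2() ? update_avx2 : update_base;
    	return update_ptr(buf, len);	/* the resolving call is serviced too */
    }

    int main(void)
    {
    	uint8_t data[64] = { 0 };

    	update_ptr(data, sizeof(data));	/* first call resolves */
    	update_ptr(data, sizeof(data));	/* later calls go direct */
    	return 0;
    }
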
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c
new file mode 100644
index 000000000..77ebb964e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c
@@ -0,0 +1,206 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1_murmur3_x64_128.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Loop many times over the same (cached) buffer
+# define TEST_LEN 16*1024
+# define TEST_LOOPS 20000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define TEST_LEN 32*1024*1024
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#define TEST_MEM TEST_LEN
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_MURMUR3_CTX_ERROR_NONE){ \
+		printf("The stitch function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest);
+
+extern void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * murmur3_x64_128_digest);
+
+void mh_sha1_murmur3_x64_128_base(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * mh_sha1_digest, uint32_t * murmur3_x64_128_digest)
+{
+ mh_sha1_ref(buffer, len, mh_sha1_digest);
+ murmur3_x64_128(buffer, len, murmur_seed, murmur3_x64_128_digest);
+
+ return;
+}
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS],
+ uint32_t murmur3_base[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+ int murmur3_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha1_fail++;
+ }
+
+ for (i = 0; i < MURMUR3_x64_128_DIGEST_WORDS; i++) {
+ if (murmur3_test[i] != murmur3_base[i])
+ murmur3_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 20);
+ printf("ref: ");
+ dump((char *)hash_test, 20);
+ }
+ if (murmur3_fail) {
+ printf("murmur3 fail test\n");
+ printf("base: ");
+ dump((char *)murmur3_base, 16);
+ printf("ref: ");
+ dump((char *)murmur3_test, 16);
+ }
+
+ return mh_sha1_fail + murmur3_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int i, fail = 0;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS];
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS],
+ murmur3_base[MURMUR3_x64_128_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ struct mh_sha1_murmur3_x64_128_ctx *update_ctx = NULL;
+ struct perf start, stop;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_perf:\n");
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+		printf("malloc failed, test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ // mh_sha1_murmur3 base version
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, murmur3_base);
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS / 10; i++) {
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base,
+ murmur3_base);
+ }
+ perf_stop(&stop);
+ printf("mh_sha1_murmur3_x64_128_base" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ //Update feature test
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+ }
+ perf_stop(&stop);
+ printf(xstr(TEST_UPDATE_FUNCTION) TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ // Check results
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", TEST_LEN);
+ return -1;
+ }
+
+ if (fail)
+ printf("Test failed function test%d\n", fail);
+ else
+ printf("Pass func check\n");
+
+ return fail;
+}
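
The perf harness above drives the stitched hash through the init/update/finalize triple. For reference, a minimal standalone use of the same public API is sketched below; the digests are printed rather than compared against fixed vectors, since no test vectors appear in this hunk.

    /* Minimal use of the stitched mh_sha1 + murmur3 API. */
    #include <stdio.h>
    #include "mh_sha1_murmur3_x64_128.h"

    int main(void)
    {
    	static const char msg[] = "hello, mh_sha1 + murmur3";
    	struct mh_sha1_murmur3_x64_128_ctx ctx;
    	uint32_t sha1_digest[SHA1_DIGEST_WORDS];
    	uint32_t murmur_digest[MURMUR3_x64_128_DIGEST_WORDS];
    	int i;

    	if (mh_sha1_murmur3_x64_128_init(&ctx, 0x1234) != MH_SHA1_MURMUR3_CTX_ERROR_NONE ||
    	    mh_sha1_murmur3_x64_128_update(&ctx, msg, sizeof(msg) - 1) != MH_SHA1_MURMUR3_CTX_ERROR_NONE ||
    	    mh_sha1_murmur3_x64_128_finalize(&ctx, sha1_digest, murmur_digest) != MH_SHA1_MURMUR3_CTX_ERROR_NONE) {
    		printf("stitch call failed\n");
    		return 1;
    	}

    	printf("mh_sha1 : ");
    	for (i = 0; i < SHA1_DIGEST_WORDS; i++)
    		printf("%08x", sha1_digest[i]);
    	printf("\nmurmur3 : ");
    	for (i = 0; i < MURMUR3_x64_128_DIGEST_WORDS; i++)
    		printf("%08x", murmur_digest[i]);
    	printf("\n");
    	return 0;
    }
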
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c
new file mode 100644
index 000000000..22ab6d1f9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c
@@ -0,0 +1,248 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1_murmur3_x64_128.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_MURMUR3_CTX_ERROR_NONE){ \
+		printf("The stitch function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest);
+
+extern void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * murmur3_x64_128_digest);
+
+void mh_sha1_murmur3_x64_128_base(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * mh_sha1_digest, uint32_t * murmur3_x64_128_digest)
+{
+ mh_sha1_ref(buffer, len, mh_sha1_digest);
+ murmur3_x64_128(buffer, len, murmur_seed, murmur3_x64_128_digest);
+
+ return;
+}
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS],
+ uint32_t murmur3_base[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+ int murmur3_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha1_fail++;
+ }
+
+ for (i = 0; i < MURMUR3_x64_128_DIGEST_WORDS; i++) {
+ if (murmur3_test[i] != murmur3_base[i])
+ murmur3_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 20);
+ printf("ref: ");
+ dump((char *)hash_test, 20);
+ }
+ if (murmur3_fail) {
+ printf("murmur3 fail test\n");
+ printf("base: ");
+ dump((char *)murmur3_base, 16);
+ printf("ref: ");
+ dump((char *)murmur3_test, 16);
+ }
+
+ return mh_sha1_fail + murmur3_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS];
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS],
+ murmur3_base[MURMUR3_x64_128_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int size, offset;
+ struct mh_sha1_murmur3_x64_128_ctx *update_ctx = NULL;
+
+ printf(" " xstr(TEST_UPDATE_FUNCTION) "_test:");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+		printf("malloc failed, test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, murmur3_base);
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+ // Test various size messages
+ for (size = TEST_LEN; size >= 0; size--) {
+
+ // Fill with rand data
+ rand_buffer(buff, size);
+
+ mh_sha1_murmur3_x64_128_base(buff, size, TEST_SEED, hash_base, murmur3_base);
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ if ((size & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various buffer offsets and sizes
+ printf("offset tests");
+ for (size = TEST_LEN - 256; size > 256; size -= 11) {
+ for (offset = 0; offset < 256; offset++) {
+ mh_sha1_murmur3_x64_128_base(buff + offset, size, TEST_SEED,
+ hash_base, murmur3_base);
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail =
+ compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size=%d offset=%d\n", size, offset);
+ return -1;
+ }
+
+ }
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Run efence tests
+ printf("efence tests");
+ for (size = TEST_SIZE; size > 0; size--) {
+ offset = TEST_LEN - size;
+ mh_sha1_murmur3_x64_128_base(buff + offset, size, TEST_SEED,
+ hash_base, murmur3_base);
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size=%d offset=%d\n", size, offset);
+ return -1;
+ }
+
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c
new file mode 100644
index 000000000..0e7a3970d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c
@@ -0,0 +1,107 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef UPDATE_FUNCTION
+#include "mh_sha1_murmur3_x64_128_internal.h"
+#include <string.h>
+
+#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_base
+#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_base
+#define UPDATE_FUNCTION_SLVER
+#endif
+
+int UPDATE_FUNCTION(struct mh_sha1_murmur3_x64_128_ctx *ctx, const void *buffer, uint32_t len)
+{
+
+ uint8_t *partial_block_buffer;
+ uint64_t partial_block_len;
+ uint64_t num_blocks;
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+ uint32_t *murmur3_x64_128_digest;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+ if (ctx == NULL)
+ return MH_SHA1_MURMUR3_CTX_ERROR_NULL;
+
+ if (len == 0)
+ return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+
+ partial_block_len = ctx->total_length % MH_SHA1_BLOCK_SIZE;
+ partial_block_buffer = ctx->partial_block_buffer;
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+ murmur3_x64_128_digest = ctx->murmur3_x64_128_digest;
+
+ ctx->total_length += len;
+	// Not enough input data for an mh_sha1 block calculation
+ if (len + partial_block_len < MH_SHA1_BLOCK_SIZE) {
+ memcpy(partial_block_buffer + partial_block_len, input_data, len);
+ return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+ }
+ // mh_sha1 calculation for the previous partial block
+ if (partial_block_len != 0) {
+ memcpy(partial_block_buffer + partial_block_len, input_data,
+ MH_SHA1_BLOCK_SIZE - partial_block_len);
+		// Process this one completed block
+ BLOCK_FUNCTION(partial_block_buffer, mh_sha1_segs_digests,
+ aligned_frame_buffer, murmur3_x64_128_digest, 1);
+ input_data += MH_SHA1_BLOCK_SIZE - partial_block_len;
+ len -= MH_SHA1_BLOCK_SIZE - partial_block_len;
+ memset(partial_block_buffer, 0, MH_SHA1_BLOCK_SIZE);
+ }
+ // Calculate mh_sha1 for the current blocks
+ num_blocks = len / MH_SHA1_BLOCK_SIZE;
+ if (num_blocks > 0) {
+		// Process num_blocks whole blocks
+ BLOCK_FUNCTION(input_data, mh_sha1_segs_digests, aligned_frame_buffer,
+ murmur3_x64_128_digest, num_blocks);
+ len -= num_blocks * MH_SHA1_BLOCK_SIZE;
+ input_data += num_blocks * MH_SHA1_BLOCK_SIZE;
+ }
+ // Store the partial block
+ if (len != 0) {
+ memcpy(partial_block_buffer, input_data, len);
+ }
+
+ return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+
+}
+
+#ifdef UPDATE_FUNCTION_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+ // Version info
+struct slver mh_sha1_murmur3_x64_128_update_base_slver_0000025a;
+struct slver mh_sha1_murmur3_x64_128_update_base_slver = { 0x025a, 0x00, 0x00 };
+#endif
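
mh_sha1_murmur3_x64_128_update_base() buffers any trailing partial block in ctx->partial_block_buffer and only passes whole MH_SHA1_BLOCK_SIZE chunks to the block function, so splitting an input across several update calls must yield the same digests as a single call. A short sketch of that property using only the public API (return-value checks trimmed for brevity):

    /* Streaming vs. one-shot equivalence, relying on the partial-block
     * buffering in update_base; return values are ignored for brevity. */
    #include <assert.h>
    #include <string.h>
    #include "mh_sha1_murmur3_x64_128.h"

    static void digest_split(const uint8_t *buf, uint32_t len, uint32_t split,
    			 uint32_t sha1[SHA1_DIGEST_WORDS],
    			 uint32_t mur[MURMUR3_x64_128_DIGEST_WORDS])
    {
    	struct mh_sha1_murmur3_x64_128_ctx ctx;

    	mh_sha1_murmur3_x64_128_init(&ctx, 0x1234);
    	mh_sha1_murmur3_x64_128_update(&ctx, buf, split);
    	mh_sha1_murmur3_x64_128_update(&ctx, buf + split, len - split);
    	mh_sha1_murmur3_x64_128_finalize(&ctx, sha1, mur);
    }

    int main(void)
    {
    	uint8_t buf[3000];
    	uint32_t s1[SHA1_DIGEST_WORDS], m1[MURMUR3_x64_128_DIGEST_WORDS];
    	uint32_t s2[SHA1_DIGEST_WORDS], m2[MURMUR3_x64_128_DIGEST_WORDS];
    	uint32_t i;

    	for (i = 0; i < sizeof(buf); i++)
    		buf[i] = (uint8_t)(i * 7 + 1);

    	digest_split(buf, sizeof(buf), sizeof(buf), s1, m1);	/* whole buffer, then 0 bytes */
    	digest_split(buf, sizeof(buf), 123, s2, m2);		/* uneven split */

    	assert(!memcmp(s1, s2, sizeof(s1)));
    	assert(!memcmp(m1, m2, sizeof(m1)));
    	return 0;
    }
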
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c
new file mode 100644
index 000000000..6ae888e21
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c
@@ -0,0 +1,272 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1_murmur3_x64_128.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_MURMUR3_CTX_ERROR_NONE){ \
+		printf("The stitch function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest);
+
+extern void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * murmur3_x64_128_digest);
+
+void mh_sha1_murmur3_x64_128_base(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * mh_sha1_digest, uint32_t * murmur3_x64_128_digest)
+{
+ mh_sha1_ref(buffer, len, mh_sha1_digest);
+ murmur3_x64_128(buffer, len, murmur_seed, murmur3_x64_128_digest);
+
+ return;
+}
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS],
+ uint32_t murmur3_base[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+ int murmur3_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha1_fail++;
+ }
+
+ for (i = 0; i < MURMUR3_x64_128_DIGEST_WORDS; i++) {
+ if (murmur3_test[i] != murmur3_base[i])
+ murmur3_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 20);
+ printf("ref: ");
+ dump((char *)hash_test, 20);
+ }
+ if (murmur3_fail) {
+ printf("murmur3 fail test\n");
+ printf("base: ");
+ dump((char *)murmur3_base, 16);
+ printf("ref: ");
+ dump((char *)murmur3_test, 16);
+ }
+
+ return mh_sha1_fail + murmur3_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0, i;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS];
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS],
+ murmur3_base[MURMUR3_x64_128_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int update_count;
+ int size1, size2, offset, addr_offset;
+ struct mh_sha1_murmur3_x64_128_ctx *update_ctx = NULL;
+ uint8_t *mem_addr = NULL;
+
+ printf(" " xstr(TEST_UPDATE_FUNCTION) "_test:");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+		printf("malloc failed, test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, murmur3_base);
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+	// Test various message sizes split across two updates.
+ printf("\n various size messages by update twice tests");
+ for (size1 = TEST_LEN; size1 >= 0; size1--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base,
+ murmur3_base);
+
+ // subsequent update
+		size2 = TEST_LEN - size1;	// size2 differs from size1
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size1));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + size1, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various update count
+ printf("\n various update count tests");
+ for (update_count = 1; update_count <= TEST_LEN; update_count++) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base,
+ murmur3_base);
+
+ // subsequent update
+ size1 = TEST_LEN / update_count;
+		size2 = TEST_LEN - size1 * (update_count - 1);	// size2 differs from size1
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ for (i = 1, offset = 0; i < update_count; i++) {
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size1));
+ offset += size1;
+ }
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+	// Test various start addresses of the ctx.
+ printf("\n various start address of ctx test");
+ free(update_ctx);
+ mem_addr = (uint8_t *) malloc(sizeof(*update_ctx) + AVX512_ALIGNED * 10);
+ for (addr_offset = AVX512_ALIGNED * 10; addr_offset >= 0; addr_offset--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base,
+ murmur3_base);
+
+		// an unaligned offset
+ update_ctx = (struct mh_sha1_murmur3_x64_128_ctx *)(mem_addr + addr_offset);
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail addr_offset=%d\n", addr_offset);
+ return -1;
+ }
+
+ if ((addr_offset & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c
new file mode 100644
index 000000000..f5fe30a83
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c
@@ -0,0 +1,85 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h> // for NULL
+#include "murmur3_x64_128_internal.c"
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX2 __attribute__ ((optimize(1)))
+#else
+# define OPT_FIX2
+#endif
+
+/*******************************************************************
+ * Single API which can calculate murmur3
+ ******************************************************************/
+/**
+ * @brief Get the digest of murmur3_x64_128 through a single API.
+ *
+ * Uses murmur3_x64_128_block and murmur3_x64_128_tail.
+ * Serves as the reference for testing the murmur3_x64_128 digest.
+ *
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param murmur_seed Seed as an initial digest of murmur3
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns none
+ *
+ */
+void OPT_FIX2 murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * murmur3_x64_128_digest)
+{
+ uint64_t *murmur3_x64_128_hash;
+ uint32_t murmur3_x64_128_hash_dword[4];
+ uint8_t *tail_buffer;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+	// Initialize the murmur3 state from the seed
+ murmur3_x64_128_hash = (uint64_t *) murmur3_x64_128_hash_dword;
+ murmur3_x64_128_hash[0] = murmur_seed;
+ murmur3_x64_128_hash[1] = murmur_seed;
+
+ // process bodies
+ murmur3_x64_128_block((uint8_t *) input_data, len / MUR_BLOCK_SIZE,
+ murmur3_x64_128_hash_dword);
+
+ // process finalize
+ tail_buffer = (uint8_t *) input_data + len - len % MUR_BLOCK_SIZE;
+ murmur3_x64_128_tail(tail_buffer, len, murmur3_x64_128_hash_dword);
+
+ // output the digests
+ if (murmur3_x64_128_digest != NULL) {
+ murmur3_x64_128_digest[0] = murmur3_x64_128_hash_dword[0];
+ murmur3_x64_128_digest[1] = murmur3_x64_128_hash_dword[1];
+ murmur3_x64_128_digest[2] = murmur3_x64_128_hash_dword[2];
+ murmur3_x64_128_digest[3] = murmur3_x64_128_hash_dword[3];
+ }
+
+ return;
+}
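
murmur3_x64_128() above is the reference single-call wrapper: it seeds both 64-bit halves of the state, runs murmur3_x64_128_block() over the full 16-byte blocks, then murmur3_x64_128_tail() over the remainder. A minimal standalone call is sketched below; the digest is simply printed, since no fixed test vector is asserted in this hunk.

    /* Standalone call to the reference murmur3_x64_128() defined above.
     * The function is declared here the same way the tests declare it. */
    #include <stdio.h>
    #include <stdint.h>

    extern void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
    			    uint32_t *murmur3_x64_128_digest);

    int main(void)
    {
    	const char msg[] = "The quick brown fox jumps over the lazy dog";
    	uint32_t digest[4];	/* 4 words, i.e. MURMUR3_x64_128_DIGEST_WORDS */
    	int i;

    	murmur3_x64_128(msg, sizeof(msg) - 1, 0x1234, digest);
    	for (i = 0; i < 4; i++)
    		printf("%08x", digest[i]);
    	printf("\n");
    	return 0;
    }
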
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c
new file mode 100644
index 000000000..67eabd0c4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c
@@ -0,0 +1,138 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha1_murmur3_x64_128_internal.h"
+#include <stdlib.h> // for NULL
+
+/* murmur3_x64_128 constants */
+// Rotation (circular shift) amounts in bits
+#define MUR_SH1 31
+#define MUR_SH2 33
+#define MUR_SH3 27
+#define MUR_SH4 31
+#define MUR_SH5 33
+
+#define MUR_MUL 5
+#define MUR_ADD1 0x52dce729
+#define MUR_ADD2 0x38495ab5
+
+#define MUR_CON1 0x87c37b91114253d5LLU
+#define MUR_CON2 0x4cf5ad432745937fLLU
+
+#define MUR_FMUL1 0xff51afd7ed558ccdLLU
+#define MUR_FMUL2 0xc4ceb9fe1a85ec53LLU
+
+/* murmur3_x64_128 inline functions */
+static inline uint64_t blockmix64(uint64_t data, uint64_t conA, uint64_t conB, uint64_t shift)
+{
+ data *= conA;
+ data = (data << shift) | (data >> (64 - shift));
+ data *= conB;
+ return data;
+}
+
+static inline uint64_t hashmix64(uint64_t hashA, uint64_t hashB, uint64_t data, uint64_t add,
+ uint64_t shift)
+{
+ hashA ^= data;
+ hashA = (hashA << shift) | (hashA >> (64 - shift));
+ hashA += hashB;
+ hashA = hashA * MUR_MUL + add;
+ return hashA;
+}
+
+void murmur3_x64_128_block(const uint8_t * input_data, uint32_t num_blocks,
+ uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS])
+{
+ uint64_t data1, data2;
+ uint64_t *input_qword = (uint64_t *) input_data;
+ uint64_t *hash = (uint64_t *) digests;
+ uint32_t i = 0;
+
+ while (i < num_blocks) {
+ data1 = input_qword[i * 2];
+ data2 = input_qword[i * 2 + 1];
+ data1 = blockmix64(data1, MUR_CON1, MUR_CON2, MUR_SH1);
+ data2 = blockmix64(data2, MUR_CON2, MUR_CON1, MUR_SH2);
+ hash[0] = hashmix64(hash[0], hash[1], data1, MUR_ADD1, MUR_SH3);
+ hash[1] = hashmix64(hash[1], hash[0], data2, MUR_ADD2, MUR_SH4);
+ i++;
+ }
+
+ return;
+}
+
+void murmur3_x64_128_tail(const uint8_t * tail_buffer, uint32_t total_len,
+ uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS])
+{
+ uint64_t data1, data2;
+ uint64_t *hash = (uint64_t *) digests;
+ uint64_t tail_len = total_len % 16;
+ uint8_t *tail = (uint8_t *) tail_buffer;
+
+ union {
+ uint64_t hash[2];
+ uint8_t hashB[16];
+ } hashU;
+
+ // tail
+ hashU.hash[0] = hashU.hash[1] = 0;
+
+ while (tail_len-- > 0)
+ hashU.hashB[tail_len] = tail[tail_len];
+
+ data1 = hashU.hash[0];
+ data2 = hashU.hash[1];
+
+ data1 = blockmix64(data1, MUR_CON1, MUR_CON2, MUR_SH1);
+ data2 = blockmix64(data2, MUR_CON2, MUR_CON1, MUR_SH2);
+
+ hash[0] ^= total_len ^ data1;
+ hash[1] ^= total_len ^ data2;
+
+ hash[0] += hash[1];
+ hash[1] += hash[0];
+
+ hash[0] ^= hash[0] >> MUR_SH5;
+ hash[0] *= MUR_FMUL1;
+ hash[0] ^= hash[0] >> MUR_SH5;
+ hash[0] *= MUR_FMUL2;
+ hash[0] ^= hash[0] >> MUR_SH5;
+
+ hash[1] ^= hash[1] >> MUR_SH5;
+ hash[1] *= MUR_FMUL1;
+ hash[1] ^= hash[1] >> MUR_SH5;
+ hash[1] *= MUR_FMUL2;
+ hash[1] ^= hash[1] >> MUR_SH5;
+
+ hash[0] += hash[1];
+ hash[1] += hash[0];
+
+ return;
+}
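
blockmix64() and hashmix64() implement their rotations with the shift pair (x << s) | (x >> (64 - s)). A tiny self-check of that rotate-left identity, illustrative only and not part of the library:

    /* (x << s) | (x >> (64 - s)) is a 64-bit circular left shift for 0 < s < 64. */
    #include <assert.h>
    #include <stdint.h>

    static uint64_t rotl64(uint64_t x, unsigned s)
    {
    	return (x << s) | (x >> (64 - s));
    }

    int main(void)
    {
    	assert(rotl64(0x8000000000000001ULL, 1) == 0x0000000000000003ULL);
    	assert(rotl64(0x0123456789abcdefULL, 32) == 0x89abcdef01234567ULL);
    	assert(rotl64(rotl64(0xdeadbeefULL, 31), 33) == 0xdeadbeefULL);	/* 31 + 33 = 64 */
    	return 0;
    }
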
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am b/src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am
new file mode 100644
index 000000000..d6e8b61ab
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/Makefile.am
@@ -0,0 +1,88 @@
+########################################################################
+# Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_sha256 = mh_sha256/sha256_for_mh_sha256.c
+
+lsrc_mh_sha256 = mh_sha256/mh_sha256.c \
+ mh_sha256/mh_sha256_block_sse.asm \
+ mh_sha256/mh_sha256_block_avx.asm \
+ mh_sha256/mh_sha256_block_avx2.asm \
+ mh_sha256/mh_sha256_multibinary.asm \
+ mh_sha256/mh_sha256_finalize_base.c \
+ mh_sha256/mh_sha256_update_base.c \
+ mh_sha256/mh_sha256_block_base.c
+
+lsrc_mh_sha256 += mh_sha256/mh_sha256_block_avx512.asm \
+ mh_sha256/mh_sha256_avx512.c
+
+lsrc_x86_64 += $(lsrc_sha256) \
+ $(lsrc_mh_sha256)
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+other_src += mh_sha256/mh_sha256_ref.c \
+ include/reg_sizes.asm \
+ include/multibinary.asm \
+ include/test.h \
+ mh_sha256/mh_sha256_internal.h
+
+lsrc_aarch64 += $(lsrc_sha256) \
+ mh_sha256/aarch64/mh_sha256_multibinary.S \
+ mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c \
+ mh_sha256/aarch64/mh_sha256_block_ce.S \
+ mh_sha256/aarch64/mh_sha256_ce.c \
+ mh_sha256/mh_sha256.c \
+ mh_sha256/mh_sha256_finalize_base.c \
+ mh_sha256/mh_sha256_update_base.c \
+ mh_sha256/mh_sha256_block_base.c
+
+lsrc_base_aliases += $(lsrc_sha256) \
+ mh_sha256/mh_sha256_base_aliases.c \
+ mh_sha256/mh_sha256.c \
+ mh_sha256/mh_sha256_finalize_base.c \
+ mh_sha256/mh_sha256_update_base.c \
+ mh_sha256/mh_sha256_block_base.c
+
+src_include += -I $(srcdir)/mh_sha256
+
+extern_hdrs += include/mh_sha256.h
+
+check_tests += mh_sha256/mh_sha256_test
+unit_tests += mh_sha256/mh_sha256_update_test
+
+perf_tests += mh_sha256/mh_sha256_perf
+
+
+mh_sha256_test: mh_sha256_ref.o
+mh_sha256_mh_sha256_test_LDADD = mh_sha256/mh_sha256_ref.lo libisal_crypto.la
+
+mh_sha256_update_test: mh_sha256_ref.o
+mh_sha256_mh_sha256_update_test_LDADD = mh_sha256/mh_sha256_ref.lo libisal_crypto.la
+
+mh_sha256_mh_sha256_perf_LDADD = libisal_crypto.la
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c
new file mode 100644
index 000000000..155790fc1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_aarch64_dispatcher.c
@@ -0,0 +1,49 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(mh_sha256_update)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA2)
+ return PROVIDER_INFO(mh_sha256_update_ce);
+
+ return PROVIDER_BASIC(mh_sha256_update);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(mh_sha256_finalize)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA2)
+ return PROVIDER_INFO(mh_sha256_finalize_ce);
+
+ return PROVIDER_BASIC(mh_sha256_finalize);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S
new file mode 100644
index 000000000..53a78ea7d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_block_ce.S
@@ -0,0 +1,731 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 6
+
+ .global mh_sha256_block_ce
+ .type mh_sha256_block_ce, %function
+
+/*
+Macros
+*/
+
+.macro declare_vector_reg name:req,reg:req,default:req
+ \name .req \default\reg
+ q_\name .req q\reg
+ v_\name .req v\reg
+ s_\name .req s\reg
+.endm
+
+declare_vector_reg lane0_msg0, 0,v
+declare_vector_reg lane1_msg0, 1,v
+declare_vector_reg lane2_msg0, 2,v
+declare_vector_reg lane3_msg0, 3,v
+
+declare_vector_reg lane0_msg1, 4,v
+declare_vector_reg lane1_msg1, 5,v
+declare_vector_reg lane2_msg1, 6,v
+declare_vector_reg lane3_msg1, 7,v
+
+declare_vector_reg lane0_msg2, 8,v
+declare_vector_reg lane1_msg2, 9,v
+declare_vector_reg lane2_msg2, 10,v
+declare_vector_reg lane3_msg2, 11,v
+
+declare_vector_reg lane0_msg3, 12,v
+declare_vector_reg lane1_msg3, 13,v
+declare_vector_reg lane2_msg3, 14,v
+declare_vector_reg lane3_msg3, 15,v
+
+declare_vector_reg lane0_state0, 16,v
+declare_vector_reg lane1_state0, 17,v
+declare_vector_reg lane2_state0, 18,v
+declare_vector_reg lane3_state0, 19,v
+
+declare_vector_reg lane0_state1, 20,v
+declare_vector_reg lane1_state1, 21,v
+declare_vector_reg lane2_state1, 22,v
+declare_vector_reg lane3_state1, 23,v
+
+declare_vector_reg lane0_tmp0, 24,v
+declare_vector_reg lane1_tmp0, 25,v
+declare_vector_reg lane2_tmp0, 26,v
+declare_vector_reg lane3_tmp0, 27,v
+
+declare_vector_reg lane0_tmp2, 28,v
+declare_vector_reg lane1_tmp2, 29,v
+declare_vector_reg lane2_tmp2, 30,v
+declare_vector_reg lane3_tmp2, 31,v
+
+declare_vector_reg key, 27,v
+declare_vector_reg tmp, 29,v
+
+/*
+void mh_sha256_block_ce(const uint8_t * input_data,
+ uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE],
+ uint32_t num_blocks);
+*/
+ x_input_data .req x0
+ x_digests .req x1
+ x_frame_buffer .req x2
+ w_num_blocks .req w3
+
+ x_digest_addr .req x4
+ x_key_addr .req x5
+ x_msg_addr .req x6
+ x_lane_offs .req x7
+ x_offs .req x9
+ w_input_data_end .req w10
+ x_input_data_end .req x10
+ x_tmp .req x11
+mh_sha256_block_ce:
+ cbz w_num_blocks, .exit
+ mov w_input_data_end, w_num_blocks
+
+ ubfiz x_input_data_end, x_input_data_end, 10, 32
+ add x_input_data_end, x_input_data, x_input_data_end
+
+ adrp x_key_addr, .key_addr
+ add x_key_addr, x_key_addr, :lo12:.key_addr
+
+ stp d8, d9, [sp, -192]!
+
+ stp d10, d11, [sp, 16]
+ stp d12, d13, [sp, 32]
+ stp d14, d15, [sp, 48]
+
+ .p2align 3,,7
+.start_loop:
+ mov x_lane_offs, 0
+ mov x_digest_addr, x_digests
+
+.lane_loop:
+ add x_msg_addr, x_input_data, x_lane_offs, lsl 2
+
+ .p2align 3,,7
+ mov x_offs, 64
+ mov x_tmp, x_digest_addr
+ ld4 {v_lane0_state0.S-v_lane3_state0.S}[0], [x_tmp], x_offs
+ ld4 {v_lane0_state0.S-v_lane3_state0.S}[1], [x_tmp], x_offs
+ ld4 {v_lane0_state0.S-v_lane3_state0.S}[2], [x_tmp], x_offs
+ ld4 {v_lane0_state0.S-v_lane3_state0.S}[3], [x_tmp], x_offs
+
+ add x_tmp, x_digest_addr, 256
+ ld4 {v_lane0_state1.S-v_lane3_state1.S}[0], [x_tmp], x_offs
+ ld4 {v_lane0_state1.S-v_lane3_state1.S}[1], [x_tmp], x_offs
+ ld4 {v_lane0_state1.S-v_lane3_state1.S}[2], [x_tmp], x_offs
+ ld4 {v_lane0_state1.S-v_lane3_state1.S}[3], [x_tmp], x_offs
+
+ ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[0], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[1], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[2], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg0.S-v_lane3_msg0.S}[3], [x_msg_addr], x_offs
+
+ ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[0], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[1], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[2], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg1.S-v_lane3_msg1.S}[3], [x_msg_addr], x_offs
+
+ ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[0], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[1], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[2], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg2.S-v_lane3_msg2.S}[3], [x_msg_addr], x_offs
+
+ ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[0], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[1], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[2], [x_msg_addr], x_offs
+ ld4 {v_lane0_msg3.S-v_lane3_msg3.S}[3], [x_msg_addr], x_offs
+
+	// byte-swap each 32-bit word: SHA-256 treats message words as big-endian
+ rev32 v_lane0_msg0.16b, v_lane0_msg0.16b
+ rev32 v_lane1_msg0.16b, v_lane1_msg0.16b
+ rev32 v_lane2_msg0.16b, v_lane2_msg0.16b
+ rev32 v_lane3_msg0.16b, v_lane3_msg0.16b
+
+ rev32 v_lane0_msg1.16b, v_lane0_msg1.16b
+ rev32 v_lane1_msg1.16b, v_lane1_msg1.16b
+ rev32 v_lane2_msg1.16b, v_lane2_msg1.16b
+ rev32 v_lane3_msg1.16b, v_lane3_msg1.16b
+
+ rev32 v_lane0_msg2.16b, v_lane0_msg2.16b
+ rev32 v_lane1_msg2.16b, v_lane1_msg2.16b
+ rev32 v_lane2_msg2.16b, v_lane2_msg2.16b
+ rev32 v_lane3_msg2.16b, v_lane3_msg2.16b
+
+ rev32 v_lane0_msg3.16b, v_lane0_msg3.16b
+ rev32 v_lane1_msg3.16b, v_lane1_msg3.16b
+ rev32 v_lane2_msg3.16b, v_lane2_msg3.16b
+ rev32 v_lane3_msg3.16b, v_lane3_msg3.16b
+
+ // rounds 0-3
+ ldr q_key, [x_key_addr]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ str q_lane0_state1, [sp, 64]
+ str q_lane1_state1, [sp, 80]
+ str q_lane2_state1, [sp, 96]
+ str q_lane3_state1, [sp, 112]
+
+ mov x_offs, 64
+ mov x_tmp, x_digest_addr
+ ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[0], [x_tmp], x_offs
+ ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[1], [x_tmp], x_offs
+ ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[2], [x_tmp], x_offs
+ ld4 {v_lane0_tmp2.S-v_lane3_tmp2.S}[3], [x_tmp], x_offs
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su0 v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su0 v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su0 v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ sha256su1 v_lane0_msg0.4s, v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su1 v_lane1_msg0.4s, v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su1 v_lane2_msg0.4s, v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su1 v_lane3_msg0.4s, v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ // rounds 4-7
+ ldr q_key, [x_key_addr, 16]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su0 v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su0 v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su0 v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ sha256su1 v_lane0_msg1.4s, v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su1 v_lane1_msg1.4s, v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su1 v_lane2_msg1.4s, v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su1 v_lane3_msg1.4s, v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ // rounds 8-11
+ ldr q_key, [x_key_addr, 32]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su0 v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su0 v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su0 v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ sha256su1 v_lane0_msg2.4s, v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su1 v_lane1_msg2.4s, v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su1 v_lane2_msg2.4s, v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su1 v_lane3_msg2.4s, v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ // rounds 12-15
+ ldr q_key, [x_key_addr, 48]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su0 v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su0 v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su0 v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ sha256su1 v_lane0_msg3.4s, v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su1 v_lane1_msg3.4s, v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su1 v_lane2_msg3.4s, v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su1 v_lane3_msg3.4s, v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ // rounds 16-19
+ ldr q_key, [x_key_addr, 64]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su0 v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su0 v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su0 v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ sha256su1 v_lane0_msg0.4s, v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su1 v_lane1_msg0.4s, v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su1 v_lane2_msg0.4s, v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su1 v_lane3_msg0.4s, v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ // rounds 20-23
+ ldr q_key, [x_key_addr, 80]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su0 v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su0 v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su0 v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ sha256su1 v_lane0_msg1.4s, v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su1 v_lane1_msg1.4s, v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su1 v_lane2_msg1.4s, v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su1 v_lane3_msg1.4s, v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ // rounds 24-27
+ ldr q_key, [x_key_addr, 96]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su0 v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su0 v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su0 v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ sha256su1 v_lane0_msg2.4s, v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su1 v_lane1_msg2.4s, v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su1 v_lane2_msg2.4s, v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su1 v_lane3_msg2.4s, v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ // rounds 28-31
+ ldr q_key, [x_key_addr, 112]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su0 v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su0 v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su0 v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ sha256su1 v_lane0_msg3.4s, v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su1 v_lane1_msg3.4s, v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su1 v_lane2_msg3.4s, v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su1 v_lane3_msg3.4s, v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ // rounds 32-35
+ ldr q_key, [x_key_addr, 128]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su0 v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su0 v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su0 v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ sha256su1 v_lane0_msg0.4s, v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su1 v_lane1_msg0.4s, v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su1 v_lane2_msg0.4s, v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su1 v_lane3_msg0.4s, v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ // rounds 36-39
+ ldr q_key, [x_key_addr, 144]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su0 v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su0 v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su0 v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ sha256su1 v_lane0_msg1.4s, v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su1 v_lane1_msg1.4s, v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su1 v_lane2_msg1.4s, v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su1 v_lane3_msg1.4s, v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ // rounds 40-43
+ ldr q_key, [x_key_addr, 160]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg2.4s, v_lane0_msg3.4s
+ sha256su0 v_lane1_msg2.4s, v_lane1_msg3.4s
+ sha256su0 v_lane2_msg2.4s, v_lane2_msg3.4s
+ sha256su0 v_lane3_msg2.4s, v_lane3_msg3.4s
+
+ sha256su1 v_lane0_msg2.4s, v_lane0_msg0.4s, v_lane0_msg1.4s
+ sha256su1 v_lane1_msg2.4s, v_lane1_msg0.4s, v_lane1_msg1.4s
+ sha256su1 v_lane2_msg2.4s, v_lane2_msg0.4s, v_lane2_msg1.4s
+ sha256su1 v_lane3_msg2.4s, v_lane3_msg0.4s, v_lane3_msg1.4s
+
+ // rounds 44-47
+ ldr q_key, [x_key_addr, 176]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ sha256su0 v_lane0_msg3.4s, v_lane0_msg0.4s
+ sha256su0 v_lane1_msg3.4s, v_lane1_msg0.4s
+ sha256su0 v_lane2_msg3.4s, v_lane2_msg0.4s
+ sha256su0 v_lane3_msg3.4s, v_lane3_msg0.4s
+
+ sha256su1 v_lane0_msg3.4s, v_lane0_msg1.4s, v_lane0_msg2.4s
+ sha256su1 v_lane1_msg3.4s, v_lane1_msg1.4s, v_lane1_msg2.4s
+ sha256su1 v_lane2_msg3.4s, v_lane2_msg1.4s, v_lane2_msg2.4s
+ sha256su1 v_lane3_msg3.4s, v_lane3_msg1.4s, v_lane3_msg2.4s
+
+ // rounds 48-51
+ ldr q_key, [x_key_addr, 192]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg0.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg0.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg0.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg0.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ // rounds 52-55
+ ldr q_key, [x_key_addr, 208]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg1.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg1.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg1.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg1.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ // rounds 56-59
+ ldr q_key, [x_key_addr, 224]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg2.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg2.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg2.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg2.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ // rounds 60-63
+ ldr q_key, [x_key_addr, 240]
+ add v_lane0_tmp0.4s, v_key.4s, v_lane0_msg3.4s
+ add v_lane1_tmp0.4s, v_key.4s, v_lane1_msg3.4s
+ add v_lane2_tmp0.4s, v_key.4s, v_lane2_msg3.4s
+ add v_lane3_tmp0.4s, v_key.4s, v_lane3_msg3.4s
+
+ mov v_lane0_tmp2.16b, v_lane0_state0.16b
+ mov v_lane1_tmp2.16b, v_lane1_state0.16b
+ mov v_lane2_tmp2.16b, v_lane2_state0.16b
+ mov v_lane3_tmp2.16b, v_lane3_state0.16b
+
+ sha256h q_lane0_state0, q_lane0_state1, v_lane0_tmp0.4s
+ sha256h q_lane1_state0, q_lane1_state1, v_lane1_tmp0.4s
+ sha256h q_lane2_state0, q_lane2_state1, v_lane2_tmp0.4s
+ sha256h q_lane3_state0, q_lane3_state1, v_lane3_tmp0.4s
+
+ sha256h2 q_lane0_state1, q_lane0_tmp2, v_lane0_tmp0.4s
+ sha256h2 q_lane1_state1, q_lane1_tmp2, v_lane1_tmp0.4s
+ sha256h2 q_lane2_state1, q_lane2_tmp2, v_lane2_tmp0.4s
+ sha256h2 q_lane3_state1, q_lane3_tmp2, v_lane3_tmp0.4s
+
+ mov x_offs, 64
+ mov x_tmp, x_digest_addr
+ ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[0], [x_tmp], x_offs
+ ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[1], [x_tmp], x_offs
+ ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[2], [x_tmp], x_offs
+ ld4 {v_lane0_tmp0.S-v_lane3_tmp0.S}[3], [x_tmp], x_offs
+
+ add v_lane0_state0.4s, v_lane0_tmp0.4s, v_lane0_state0.4s
+ add v_lane1_state0.4s, v_lane1_tmp0.4s, v_lane1_state0.4s
+ add v_lane2_state0.4s, v_lane2_tmp0.4s, v_lane2_state0.4s
+ add v_lane3_state0.4s, v_lane3_tmp0.4s, v_lane3_state0.4s
+
+ mov x_offs, 64
+ mov x_tmp, x_digest_addr
+ st4 {v_lane0_state0.S-v_lane3_state0.S}[0], [x_tmp], x_offs
+ st4 {v_lane0_state0.S-v_lane3_state0.S}[1], [x_tmp], x_offs
+ st4 {v_lane0_state0.S-v_lane3_state0.S}[2], [x_tmp], x_offs
+ st4 {v_lane0_state0.S-v_lane3_state0.S}[3], [x_tmp], x_offs
+
+ ldp q_lane0_tmp2, q_lane1_tmp2, [sp, 64]
+ ldp q_lane2_tmp2, q_lane3_tmp2, [sp, 96]
+
+ add v_lane0_state1.4s, v_lane0_tmp2.4s, v_lane0_state1.4s
+ add v_lane1_state1.4s, v_lane1_tmp2.4s, v_lane1_state1.4s
+ add v_lane2_state1.4s, v_lane2_tmp2.4s, v_lane2_state1.4s
+ add v_lane3_state1.4s, v_lane3_tmp2.4s, v_lane3_state1.4s
+
+ mov x_offs, 64
+ add x_tmp, x_digest_addr, 256
+ st4 {v_lane0_state1.S-v_lane3_state1.S}[0], [x_tmp], x_offs
+ st4 {v_lane0_state1.S-v_lane3_state1.S}[1], [x_tmp], x_offs
+ st4 {v_lane0_state1.S-v_lane3_state1.S}[2], [x_tmp], x_offs
+ st4 {v_lane0_state1.S-v_lane3_state1.S}[3], [x_tmp], x_offs
+
+ add x_digest_addr, x_digest_addr, 16
+ add x_lane_offs, x_lane_offs, 4
+ cmp x_lane_offs, 16
+ bne .lane_loop
+
+ add x_input_data, x_input_data, 1024
+ cmp x_input_data, x_input_data_end
+ bne .start_loop
+
+ ldp d10, d11, [sp, 16]
+ ldp d12, d13, [sp, 32]
+ ldp d14, d15, [sp, 48]
+ ldp d8, d9, [sp], 192
+.exit:
+ ret
+ .size mh_sha256_block_ce, .-mh_sha256_block_ce
+
+ .section .rodata
+ .align 4
+ .set .key_addr,. + 0
+ .type K, %object
+ .size K, 256
+K:
+ .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+ .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+ .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+ .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+ .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+ .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+ .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+ .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c
new file mode 100644
index 000000000..c42333ed5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_ce.c
@@ -0,0 +1,53 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+void mh_sha256_block_ce(const uint8_t * input_data,
+ uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+/***************mh_sha256_update***********/
+// mh_sha256_update_ce.c
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_ce
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_ce
+#include "mh_sha256_update_base.c"
+#undef MH_SHA256_UPDATE_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+/***************mh_sha256_finalize AND mh_sha256_tail***********/
+// mh_sha256_tail processes the last (incomplete) block of source data
+// mh_sha256_finalize is an mh_sha256_ctx wrapper around mh_sha256_tail
+// mh_sha256_finalize_ce.c and mh_sha256_tail_ce.c
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_ce
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_ce
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_ce
+#include "mh_sha256_finalize_base.c"
+#undef MH_SHA256_FINALIZE_FUNCTION
+#undef MH_SHA256_TAIL_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
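The #define/#include/#undef blocks above stamp architecture-specific update and finalize entry points out of the shared base sources, the same way the _sse/_avx/_avx2 variants are produced later in this patch. The sketch below only illustrates that macro-template pattern; demo_ctx, demo_template.h and the demo_* names are hypothetical stand-ins, not the actual contents of mh_sha256_update_base.c.

    /* demo_template.h - hypothetical template: the includer defines
     * UPDATE_FUNCTION and BLOCK_FUNCTION before including this file. */
    int UPDATE_FUNCTION(struct demo_ctx *ctx, const uint8_t *buf, uint32_t nblocks)
    {
            BLOCK_FUNCTION(buf, ctx->digest, nblocks);  /* arch-specific block kernel */
            ctx->blocks_done += nblocks;
            return 0;
    }

    /* demo.c - instantiates the template once per kernel */
    #include <stdint.h>

    struct demo_ctx { uint32_t digest[8][16]; uint32_t blocks_done; };

    static void demo_block_c(const uint8_t *in, uint32_t d[8][16], uint32_t n)    { (void)in; (void)d; (void)n; }
    static void demo_block_neon(const uint8_t *in, uint32_t d[8][16], uint32_t n) { (void)in; (void)d; (void)n; }

    #define UPDATE_FUNCTION demo_update_c
    #define BLOCK_FUNCTION  demo_block_c
    #include "demo_template.h"
    #undef UPDATE_FUNCTION
    #undef BLOCK_FUNCTION

    #define UPDATE_FUNCTION demo_update_neon
    #define BLOCK_FUNCTION  demo_block_neon
    #include "demo_template.h"
    #undef UPDATE_FUNCTION
    #undef BLOCK_FUNCTION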
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S
new file mode 100644
index 000000000..54eece175
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/aarch64/mh_sha256_multibinary.S
@@ -0,0 +1,35 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface mh_sha256_update
+mbin_interface mh_sha256_finalize
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c
new file mode 100644
index 000000000..242c3e218
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256.c
@@ -0,0 +1,143 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+int mh_sha256_init(struct mh_sha256_ctx *ctx)
+{
+ uint32_t(*mh_sha256_segs_digests)[HASH_SEGS];
+ uint32_t i;
+
+ if (ctx == NULL)
+ return MH_SHA256_CTX_ERROR_NULL;
+
+ memset(ctx, 0, sizeof(*ctx));
+
+ mh_sha256_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha256_interim_digests;
+ for (i = 0; i < HASH_SEGS; i++) {
+ mh_sha256_segs_digests[0][i] = MH_SHA256_H0;
+ mh_sha256_segs_digests[1][i] = MH_SHA256_H1;
+ mh_sha256_segs_digests[2][i] = MH_SHA256_H2;
+ mh_sha256_segs_digests[3][i] = MH_SHA256_H3;
+ mh_sha256_segs_digests[4][i] = MH_SHA256_H4;
+ mh_sha256_segs_digests[5][i] = MH_SHA256_H5;
+ mh_sha256_segs_digests[6][i] = MH_SHA256_H6;
+ mh_sha256_segs_digests[7][i] = MH_SHA256_H7;
+ }
+
+ return MH_SHA256_CTX_ERROR_NONE;
+}
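As context for the ctx API initialized here, a minimal caller might look like the sketch below. Only the three function signatures are taken from this patch; the header name mh_sha256.h, the availability of SHA256_DIGEST_WORDS and the MH_SHA256_CTX_ERROR_* codes to callers, and the convention that all three calls return MH_SHA256_CTX_ERROR_NONE on success are assumptions.

    /* sketch only: header name and exported macros are assumptions */
    #include <stdint.h>
    #include "mh_sha256.h"

    int demo_mh_sha256(const void *data, uint32_t len,
                       uint32_t digest[SHA256_DIGEST_WORDS])
    {
            struct mh_sha256_ctx ctx;

            if (mh_sha256_init(&ctx) != MH_SHA256_CTX_ERROR_NONE)
                    return -1;
            if (mh_sha256_update(&ctx, data, len) != MH_SHA256_CTX_ERROR_NONE)
                    return -1;
            if (mh_sha256_finalize(&ctx, digest) != MH_SHA256_CTX_ERROR_NONE)
                    return -1;
            return 0;  /* digest holds the 8-word (32-byte) multi-hash result */
    }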
+
+#if (!defined(NOARCH)) && (defined(__i386__) || defined(__x86_64__) \
+ || defined( _M_X64) || defined(_M_IX86))
+/***************mh_sha256_update***********/
+// mh_sha256_update_sse.c
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_sse
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_sse
+#include "mh_sha256_update_base.c"
+#undef MH_SHA256_UPDATE_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+// mh_sha256_update_avx.c
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_avx
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx
+#include "mh_sha256_update_base.c"
+#undef MH_SHA256_UPDATE_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+// mh_sha256_update_avx2.c
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_avx2
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx2
+#include "mh_sha256_update_base.c"
+#undef MH_SHA256_UPDATE_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+/***************mh_sha256_finalize AND mh_sha256_tail***********/
+// mh_sha256_tail processes the last (incomplete) block of source data
+// mh_sha256_finalize is an mh_sha256_ctx wrapper around mh_sha256_tail
+
+// mh_sha256_finalize_sse.c and mh_sha256_tail_sse.c
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_sse
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_sse
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_sse
+#include "mh_sha256_finalize_base.c"
+#undef MH_SHA256_FINALIZE_FUNCTION
+#undef MH_SHA256_TAIL_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+// mh_sha256_finalize_avx.c and mh_sha256_tail_avx.c
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_avx
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_avx
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx
+#include "mh_sha256_finalize_base.c"
+#undef MH_SHA256_FINALIZE_FUNCTION
+#undef MH_SHA256_TAIL_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+// mh_sha256_finalize_avx2.c and mh_sha256_tail_avx2.c
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_avx2
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_avx2
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx2
+#include "mh_sha256_finalize_base.c"
+#undef MH_SHA256_FINALIZE_FUNCTION
+#undef MH_SHA256_TAIL_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+/***************version info***********/
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+// mh_sha256_init version info
+struct slver mh_sha256_init_slver_000002b1;
+struct slver mh_sha256_init_slver = { 0x02b1, 0x00, 0x00 };
+
+// mh_sha256_update version info
+struct slver mh_sha256_update_sse_slver_000002b4;
+struct slver mh_sha256_update_sse_slver = { 0x02b4, 0x00, 0x00 };
+
+struct slver mh_sha256_update_avx_slver_020002b6;
+struct slver mh_sha256_update_avx_slver = { 0x02b6, 0x00, 0x02 };
+
+struct slver mh_sha256_update_avx2_slver_040002b8;
+struct slver mh_sha256_update_avx2_slver = { 0x02b8, 0x00, 0x04 };
+
+// mh_sha256_finalize version info
+struct slver mh_sha256_finalize_sse_slver_000002b5;
+struct slver mh_sha256_finalize_sse_slver = { 0x02b5, 0x00, 0x00 };
+
+struct slver mh_sha256_finalize_avx_slver_020002b7;
+struct slver mh_sha256_finalize_avx_slver = { 0x02b7, 0x00, 0x02 };
+
+struct slver mh_sha256_finalize_avx2_slver_040002b9;
+struct slver mh_sha256_finalize_avx2_slver = { 0x02b9, 0x00, 0x04 };
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c
new file mode 100644
index 000000000..35fb0fbad
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_avx512.c
@@ -0,0 +1,70 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+/***************mh_sha256_update***********/
+// mh_sha256_update_avx512.c
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_avx512
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx512
+#include "mh_sha256_update_base.c"
+#undef MH_SHA256_UPDATE_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+/***************mh_sha256_finalize AND mh_sha256_tail***********/
+// mh_sha256_tail processes the last (incomplete) block of source data
+// mh_sha256_finalize is an mh_sha256_ctx wrapper around mh_sha256_tail
+// mh_sha256_finalize_avx512.c and mh_sha256_tail_avx512.c
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_avx512
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_avx512
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_avx512
+#include "mh_sha256_finalize_base.c"
+#undef MH_SHA256_FINALIZE_FUNCTION
+#undef MH_SHA256_TAIL_FUNCTION
+#undef MH_SHA256_BLOCK_FUNCTION
+
+/***************version info***********/
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// mh_sha256_update version info
+struct slver mh_sha256_update_avx512_slver_060002bc;
+struct slver mh_sha256_update_avx512_slver = { 0x02bc, 0x00, 0x06 };
+
+// mh_sha256_finalize version info
+struct slver mh_sha256_finalize_avx512_slver_060002bd;
+struct slver mh_sha256_finalize_avx512_slver = { 0x02bd, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c
new file mode 100644
index 000000000..343ffb024
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_base_aliases.c
@@ -0,0 +1,40 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "mh_sha256_internal.h"
+#include <string.h>
+int mh_sha256_update(struct mh_sha256_ctx *ctx, const void *buffer, uint32_t len)
+{
+ return mh_sha256_update_base(ctx, buffer, len);
+}
+
+int mh_sha256_finalize(struct mh_sha256_ctx *ctx, void *mh_sha256_digest)
+{
+ return mh_sha256_finalize_base(ctx, mh_sha256_digest);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm
new file mode 100644
index 000000000..c2eff350d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx.asm
@@ -0,0 +1,557 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA256 digests in parallel using AVX
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha256
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used for storing segs_digests on the stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS
+
+; Common definitions
+%define ROUND tmp4
+%define TBL tmp5
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define SZ 4
+%define SZ4 4*SZ
+%define ROUNDS 64*SZ4
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, (32-(%%imm))
+ vpsrld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, (32-(%%imm))
+ vpsrld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
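+; Both PRORD variants implement a 32-bit rotate right, ror(x, n) = (x >> n) | (x << (32 - n)),
+; built from two shifts and an OR since AVX has no packed-dword rotate instruction.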
+
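+;; The round macros below follow the standard SHA-256 round functions:
+;;   ch(e,f,g)  = ((f ^ g) & e) ^ g
+;;   maj(a,b,c) = ((a ^ c) & b) | (a & c)
+;;   sigma1(e)  = ror(e,6)  ^ ror(e,11) ^ ror(e,25)
+;;   sigma0(a)  = ror(a,2)  ^ ror(a,13) ^ ror(a,22)
+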
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_R 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa %%T1, [SZ4*(%%i&0xf) + %%data]
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_W 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ4*(%%i&0xf) + %%data], %%T1
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + %%data]
+ vmovdqa a1, [SZ4*((%%i-2)&0xf) + %%data]
+ vmovdqa a0, %%T1
+ PRORD %%T1, 18-7
+ vmovdqa a2, a1
+ PRORD a1, 19-17
+ vpxor %%T1, %%T1, a0
+ PRORD %%T1, 7
+ vpxor a1, a1, a2
+ PRORD a1, 17
+ vpsrld a0, a0, 3
+ vpxor %%T1, %%T1, a0
+ vpsrld a2, a2, 10
+ vpxor a1, a1, a2
+ vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + %%data]
+ vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + %%data]
+ vpaddd %%T1, %%T1, a1
+
+ ROUND_00_15_W %%T1, %%i, %%data
+%endm
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp |
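+; (i.e. digest word j of segment i sits at byte offset j*64 + i*4, matching the
+; uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS] layout of arg 1)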
+
+align 32
+
+;void mh_sha256_block_avx(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS])
+; arg 2 pointer to aligned_frame_buffer, used to store the big-endian copy of the input
+; arg 3 number of 1KB blocks
+;
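+; Each 1KB block carries 16 interleaved segments (16 segments x 64 bytes each);
+; the .segs_loop below hashes 4 segments per pass, so four passes finish a block.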
+mk_global mh_sha256_block_avx, function, internal
+func(mh_sha256_block_avx)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+	; align rsp to the 16-byte boundary required by AVX
+ and rsp, ~0x0F
+ lea TBL,[TABLE]
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 8
+ VMOVPS a, [mh_digests_p + I*64 + 16*0]
+ VMOVPS b, [mh_digests_p + I*64 + 16*1]
+ VMOVPS c, [mh_digests_p + I*64 + 16*2]
+ VMOVPS d, [mh_digests_p + I*64 + 16*3]
+
+ vmovdqa [rsp + I*64 + 16*0], a
+ vmovdqa [rsp + I*64 + 16*1], b
+ vmovdqa [rsp + I*64 + 16*2], c
+ vmovdqa [rsp + I*64 + 16*3], d
+ %assign I (I+1)
+ %endrep
+
+.block_loop:
+	;transform input to big-endian and store it in the aligned frame buffer
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*8 to DWORD*4_SEGS*8*4
+ %assign I 0
+ %rep 16
+ VMOVPS TT0,[mh_in_p + I*64+0*16]
+ VMOVPS TT1,[mh_in_p + I*64+1*16]
+ VMOVPS TT2,[mh_in_p + I*64+2*16]
+ VMOVPS TT3,[mh_in_p + I*64+3*16]
+
+ vpshufb TT0, TMP
+ vmovdqa [mh_data_p +(I)*16 +0*256],TT0
+ vpshufb TT1, TMP
+ vmovdqa [mh_data_p +(I)*16 +1*256],TT1
+ vpshufb TT2, TMP
+ vmovdqa [mh_data_p +(I)*16 +2*256],TT2
+ vpshufb TT3, TMP
+ vmovdqa [mh_data_p +(I)*16 +3*256],TT3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+	mov	pref, 1024	;avoid prefetching repeatedly
+ .segs_loop:
+ xor ROUND, ROUND
+ ;; Initialize digests
+ vmovdqa a, [rsp + 0*64 + mh_segs]
+ vmovdqa b, [rsp + 1*64 + mh_segs]
+ vmovdqa c, [rsp + 2*64 + mh_segs]
+ vmovdqa d, [rsp + 3*64 + mh_segs]
+ vmovdqa e, [rsp + 4*64 + mh_segs]
+ vmovdqa f, [rsp + 5*64 + mh_segs]
+ vmovdqa g, [rsp + 6*64 + mh_segs]
+ vmovdqa h, [rsp + 7*64 + mh_segs]
+
+ %assign i 0
+ %rep 4
+ ROUND_00_15_R TT0, (i*4+0), mh_data_p
+ ROUND_00_15_R TT1, (i*4+1), mh_data_p
+ ROUND_00_15_R TT2, (i*4+2), mh_data_p
+ ROUND_00_15_R TT3, (i*4+3), mh_data_p
+ %assign i (i+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+
+ %assign i 16
+ %rep 48
+ %if i = 48
+ PREFETCH_X [mh_in_p + pref+128*1]
+ %endif
+ ROUND_16_XX T1, i, mh_data_p
+ %assign i (i+1)
+ %endrep
+
+ ;; add old digest
+ vpaddd a, a, [rsp + 0*64 + mh_segs]
+ vpaddd b, b, [rsp + 1*64 + mh_segs]
+ vpaddd c, c, [rsp + 2*64 + mh_segs]
+ vpaddd d, d, [rsp + 3*64 + mh_segs]
+ vpaddd e, e, [rsp + 4*64 + mh_segs]
+ vpaddd f, f, [rsp + 5*64 + mh_segs]
+ vpaddd g, g, [rsp + 6*64 + mh_segs]
+ vpaddd h, h, [rsp + 7*64 + mh_segs]
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], a
+ vmovdqa [rsp + 1*64 + mh_segs], b
+ vmovdqa [rsp + 2*64 + mh_segs], c
+ vmovdqa [rsp + 3*64 + mh_segs], d
+ vmovdqa [rsp + 4*64 + mh_segs], e
+ vmovdqa [rsp + 5*64 + mh_segs], f
+ vmovdqa [rsp + 6*64 + mh_segs], g
+ vmovdqa [rsp + 7*64 + mh_segs], h
+
+ add pref, 256
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 8
+ vmovdqa a, [rsp + I*64 + 16*0]
+ vmovdqa b, [rsp + I*64 + 16*1]
+ vmovdqa c, [rsp + I*64 + 16*2]
+ vmovdqa d, [rsp + I*64 + 16*3]
+
+ VMOVPS [mh_digests_p + I*64 + 16*0], a
+ VMOVPS [mh_digests_p + I*64 + 16*1], b
+ VMOVPS [mh_digests_p + I*64 + 16*2], c
+ VMOVPS [mh_digests_p + I*64 + 16*3], d
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=64
+
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm
new file mode 100644
index 000000000..c2b3f2c59
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx2.asm
@@ -0,0 +1,616 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA256 segment digests using AVX2
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha256
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used for storing segs_digests on the stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS
+
+; Common definitions
+%define ROUND tmp4
+%define TBL tmp5
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define SZ 4
+%define SZ8 8*SZ
+%define ROUNDS 64*SZ8
+
+%define a ymm0
+%define b ymm1
+%define c ymm2
+%define d ymm3
+%define e ymm4
+%define f ymm5
+%define g ymm6
+%define h ymm7
+
+%define a0 ymm8
+%define a1 ymm9
+%define a2 ymm10
+
+%define TT0 ymm14
+%define TT1 ymm13
+%define TT2 ymm12
+%define TT3 ymm11
+%define TT4 ymm10
+%define TT5 ymm9
+
+%define T1 ymm14
+%define TMP ymm15
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, (32-(%%imm))
+ vpsrld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, (32-(%%imm))
+ vpsrld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_R 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa %%T1, [SZ8*(%%i&0xf) + %%data]
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ8 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_W 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ8*(%%i&0xf) + %%data], %%T1
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ8 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + %%data]
+ vmovdqa a1, [SZ8*((%%i-2)&0xf) + %%data]
+ vmovdqa a0, %%T1
+ PRORD %%T1, 18-7
+ vmovdqa a2, a1
+ PRORD a1, 19-17
+ vpxor %%T1, %%T1, a0
+ PRORD %%T1, 7
+ vpxor a1, a1, a2
+ PRORD a1, 17
+ vpsrld a0, a0, 3
+ vpxor %%T1, %%T1, a0
+ vpsrld a2, a2, 10
+ vpxor a1, a1, a2
+ vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + %%data]
+ vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + %%data]
+ vpaddd %%T1, %%T1, a1
+
+ ROUND_00_15_W %%T1, %%i, %%data
+%endm
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp |
+
+align 32
+
+;void mh_sha256_block_avx2(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], 8 words x 16 segments)
+; arg 2 pointer to the aligned frame_buffer, used to hold the big-endian copy of the data
+; arg 3 number of 1KB blocks
+;
+mk_global mh_sha256_block_avx2, function, internal
+func(mh_sha256_block_avx2)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 32 Bytes needed by avx2
+ and rsp, ~0x1F
+ lea TBL,[TABLE]
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 4
+ VMOVPS a, [mh_digests_p + I*64*2 + 32*0]
+ VMOVPS b, [mh_digests_p + I*64*2 + 32*1]
+ VMOVPS c, [mh_digests_p + I*64*2 + 32*2]
+ VMOVPS d, [mh_digests_p + I*64*2 + 32*3]
+
+ vmovdqa [rsp + I*64*2 + 32*0], a
+ vmovdqa [rsp + I*64*2 + 32*1], b
+ vmovdqa [rsp + I*64*2 + 32*2], c
+ vmovdqa [rsp + I*64*2 + 32*3], d
+ %assign I (I+1)
+ %endrep
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*8 to DWORD*8_SEGS*8*2
+ %assign I 0
+ %rep 16
+ VMOVPS TT0,[mh_in_p + I*64+0*32]
+ VMOVPS TT1,[mh_in_p + I*64+1*32]
+
+ vpshufb TT0, TT0, TMP
+ vmovdqa [mh_data_p +I*32 +0*512],TT0
+ vpshufb TT1, TT1, TMP
+ vmovdqa [mh_data_p +I*32 +1*512],TT1
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 8 segments
+	mov	pref, 1024		;avoid prefetching repeatedly
+ .segs_loop:
+ xor ROUND, ROUND
+ ;; Initialize digests
+ vmovdqa a, [rsp + 0*64 + mh_segs]
+ vmovdqa b, [rsp + 1*64 + mh_segs]
+ vmovdqa c, [rsp + 2*64 + mh_segs]
+ vmovdqa d, [rsp + 3*64 + mh_segs]
+ vmovdqa e, [rsp + 4*64 + mh_segs]
+ vmovdqa f, [rsp + 5*64 + mh_segs]
+ vmovdqa g, [rsp + 6*64 + mh_segs]
+ vmovdqa h, [rsp + 7*64 + mh_segs]
+
+ %assign i 0
+ %rep 4
+ ROUND_00_15_R TT0, (i*4+0), mh_data_p
+ ROUND_00_15_R TT1, (i*4+1), mh_data_p
+ ROUND_00_15_R TT2, (i*4+2), mh_data_p
+ ROUND_00_15_R TT3, (i*4+3), mh_data_p
+ %assign i (i+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+
+ %assign i 16
+ %rep 48
+ ROUND_16_XX T1, i, mh_data_p
+ %if i % 16 = 8
+ PREFETCH_X [mh_in_p + pref+128*(i/16)]
+ %endif
+ %assign i (i+1)
+ %endrep
+
+ ;; add old digest
+ vpaddd a, a, [rsp + 0*64 + mh_segs]
+ vpaddd b, b, [rsp + 1*64 + mh_segs]
+ vpaddd c, c, [rsp + 2*64 + mh_segs]
+ vpaddd d, d, [rsp + 3*64 + mh_segs]
+ vpaddd e, e, [rsp + 4*64 + mh_segs]
+ vpaddd f, f, [rsp + 5*64 + mh_segs]
+ vpaddd g, g, [rsp + 6*64 + mh_segs]
+ vpaddd h, h, [rsp + 7*64 + mh_segs]
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], a
+ vmovdqa [rsp + 1*64 + mh_segs], b
+ vmovdqa [rsp + 2*64 + mh_segs], c
+ vmovdqa [rsp + 3*64 + mh_segs], d
+ vmovdqa [rsp + 4*64 + mh_segs], e
+ vmovdqa [rsp + 5*64 + mh_segs], f
+ vmovdqa [rsp + 6*64 + mh_segs], g
+ vmovdqa [rsp + 7*64 + mh_segs], h
+
+ add pref, 512
+ add mh_data_p, 512
+ add mh_segs, 32
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 4
+ vmovdqa a, [rsp + I*64*2 + 32*0]
+ vmovdqa b, [rsp + I*64*2 + 32*1]
+ vmovdqa c, [rsp + I*64*2 + 32*2]
+ vmovdqa d, [rsp + I*64*2 + 32*3]
+
+ VMOVPS [mh_digests_p + I*64*2 + 32*0], a
+ VMOVPS [mh_digests_p + I*64*2 + 32*1], b
+ VMOVPS [mh_digests_p + I*64*2 + 32*2], c
+ VMOVPS [mh_digests_p + I*64*2 + 32*3], d
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=64
+
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
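
The ROUND_16_XX macro above computes, per 32-bit lane, the standard SHA-256 message-schedule
recurrence W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16], with
sigma0(x) = ROR7(x) ^ ROR18(x) ^ (x >> 3) and sigma1(x) = ROR17(x) ^ ROR19(x) ^ (x >> 10).
A minimal scalar C sketch of that recurrence is given below for reference; the helper names
(rotr32, sigma0, sigma1, expand_schedule) are illustrative only and are not part of this library.

    #include <stdint.h>

    /* Rotate a 32-bit word right by n bits (0 < n < 32). */
    static inline uint32_t rotr32(uint32_t x, unsigned n)
    {
            return (x >> n) | (x << (32 - n));
    }

    /* sigma0/sigma1 as computed per lane by ROUND_16_XX (PRORD by 7 and 18 plus a
     * 3-bit shift, and PRORD by 17 and 19 plus a 10-bit shift). */
    static inline uint32_t sigma0(uint32_t x) { return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3); }
    static inline uint32_t sigma1(uint32_t x) { return rotr32(x, 17) ^ rotr32(x, 19) ^ (x >> 10); }

    /* Expand one 16-word block into the 64-word SHA-256 message schedule. */
    static void expand_schedule(uint32_t w[64])
    {
            for (int t = 16; t < 64; t++)
                    w[t] = sigma1(w[t - 2]) + w[t - 7] + sigma0(w[t - 15]) + w[t - 16];
    }
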
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm
new file mode 100644
index 000000000..1ee76ddfc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_avx512.asm
@@ -0,0 +1,682 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA256 segment digests using AVX-512
+;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha256
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+;variables used for storing segs_digests on the stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS
+; Common definitions
+%define ROUND tmp4
+%define TBL tmp5
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+%define T1 zmm8
+%define TMP0 zmm9
+%define TMP1 zmm10
+%define TMP2 zmm11
+%define TMP3 zmm12
+%define TMP4 zmm13
+%define TMP5 zmm14
+%define TMP6 zmm15
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ROTATE_ARGS 0
+%xdefine TMP_ H
+%xdefine H G
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define APPEND(a,b) a %+ b
+;; CH(E, F, G) = (E&F) ^ (~E&G)
+;; MAJ(A, B, C) = (A&B) ^ (A&C) ^ (B&C)
+;; SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22
+;; SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25
+;; sigma0 = ROR_7 ^ ROR_18 ^ SHR_3
+;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10
+
+; Main processing loop per round
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%ROUND %2
+ ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ vpaddd T1, H, TMP3 ; T1 = H + Kt
+ vmovdqa32 TMP0, E
+ vprord TMP1, E, 6 ; ROR_6(E)
+ vprord TMP2, E, 11 ; ROR_11(E)
+ vprord TMP3, E, 25 ; ROR_25(E)
+ vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddd T1, T1, %%WT ; T1 = T1 + Wt
+ vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E)
+ vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E)
+ vpaddd D, D, T1 ; D = D + T1
+
+ vprord H, A, 2 ; ROR_2(A)
+ vprord TMP2, A, 13 ; ROR_13(A)
+ vprord TMP3, A, 22 ; ROR_22(A)
+ vmovdqa32 TMP0, A
+ vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A)
+ vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+ vpaddd H, H, T1 ; H(A) = H(T2) + T1
+
+ vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_63 4
+%define %%WT %1
+%define %%WTp1 %2
+%define %%WTp9 %3
+%define %%WTp14 %4
+ vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2)
+ vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2)
+ vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2)
+ vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+
+ vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15)
+ vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15)
+ vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) +
+ ; Wt-7 + sigma0(Wt-15) +
+%endmacro
+
+; Note this is reading in a block of data for one lane
+; When all 16 are read, the data must be transposed to build msg schedule
+%macro MSG_SCHED_ROUND_00_15 2
+%define %%WT %1
+%define %%OFFSET %2
+ mov inp0, [IN + (%%OFFSET*8)]
+ vmovups %%WT, [inp0+IDX]
+%endmacro
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp |
+
+[bits 64]
+section .text
+align 32
+
+;void mh_sha256_block_avx512(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS], 8 words x 16 segments)
+; arg 2 pointer to the aligned frame_buffer, used to hold the big-endian copy of the data
+; arg 3 number of 1KB blocks
+;
+global mh_sha256_block_avx512
+func(mh_sha256_block_avx512)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 64 Bytes needed by avx512
+ and rsp, ~0x3F
+ lea TBL,[TABLE]
+
+ ; copy segs_digests into stack and ZMM
+ VMOVPS A, [mh_digests_p + 64*0]
+ VMOVPS B, [mh_digests_p + 64*1]
+ VMOVPS C, [mh_digests_p + 64*2]
+ VMOVPS D, [mh_digests_p + 64*3]
+ VMOVPS E, [mh_digests_p + 64*4]
+ VMOVPS F, [mh_digests_p + 64*5]
+ VMOVPS G, [mh_digests_p + 64*6]
+ VMOVPS H, [mh_digests_p + 64*7]
+
+.block_loop:
+ ; Save digests for later addition
+ vmovdqa32 [rsp + 64*0], A
+ vmovdqa32 [rsp + 64*1], B
+ vmovdqa32 [rsp + 64*2], C
+ vmovdqa32 [rsp + 64*3], D
+ vmovdqa32 [rsp + 64*4], E
+ vmovdqa32 [rsp + 64*5], F
+ vmovdqa32 [rsp + 64*6], G
+ vmovdqa32 [rsp + 64*7], H
+
+ vmovdqa32 TMP3, [TBL] ; First K
+	;transform the input data to big-endian
+	vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+	;keep the message schedule in the 16 extra ZMM registers (W0-W15) instead of storing it to memory
+%assign I 0
+%rep 8
+%assign J (I+1)
+ VMOVPS APPEND(W,I),[mh_in_p + I*64+0*64]
+ VMOVPS APPEND(W,J),[mh_in_p + I*64+1*64]
+
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+ vpshufb APPEND(W,J), APPEND(W,J), TMP2
+%assign I (I+2)
+%endrep
+
+ ; MSG Schedule for W0-W15 is now complete in registers
+ ; Process first 48 rounds
+ ; Calculate next Wt+16 after processing is complete and Wt is unneeded
+
+ ; PROCESS_LOOP_00_47 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep 64
+ PROCESS_LOOP APPEND(W,J), I
+ %if I < 48
+ MSG_SCHED_ROUND_16_63 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ %endif
+ %if I % 8 = 4
+ PREFETCH_X [mh_in_p + 1024+128*(I / 8)]
+ %endif
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+
+ ;; add old digest
+ vpaddd A, A, [rsp + 0*64]
+ vpaddd B, B, [rsp + 1*64]
+ vpaddd C, C, [rsp + 2*64]
+ vpaddd D, D, [rsp + 3*64]
+ vpaddd E, E, [rsp + 4*64]
+ vpaddd F, F, [rsp + 5*64]
+ vpaddd G, G, [rsp + 6*64]
+ vpaddd H, H, [rsp + 7*64]
+
+ add mh_in_p, 1024
+ sub loops, 1
+ jne .block_loop
+
+ ; copy segs_digests back to mh_digests_p
+
+ VMOVPS [mh_digests_p + 64*0], A
+ VMOVPS [mh_digests_p + 64*1], B
+ VMOVPS [mh_digests_p + 64*2], C
+ VMOVPS [mh_digests_p + 64*3], D
+ VMOVPS [mh_digests_p + 64*4], E
+ VMOVPS [mh_digests_p + 64*5], F
+ VMOVPS [mh_digests_p + 64*6], G
+ VMOVPS [mh_digests_p + 64*7], H
+
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_mh_sha256_block_avx512
+no_mh_sha256_block_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
+
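
For reference, PROCESS_LOOP above evaluates the usual SHA-256 round, T1 = H + SIGMA1(E) + CH(E,F,G) + Kt + Wt
and T2 = SIGMA0(A) + MAJ(A,B,C), on sixteen segments per instruction; the vpternlogd immediates 0xCA, 0xE8
and 0x96 encode the three-input truth tables of CH, MAJ and a three-way XOR respectively. A scalar C sketch
of one such round follows; sha256_round and its helpers are illustrative names, not library API.

    #include <stdint.h>

    static inline uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

    static inline uint32_t big_sigma0(uint32_t a) { return rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22); }
    static inline uint32_t big_sigma1(uint32_t e) { return rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25); }
    static inline uint32_t ch(uint32_t e, uint32_t f, uint32_t g)  { return (e & f) ^ (~e & g); }
    static inline uint32_t maj(uint32_t a, uint32_t b, uint32_t c) { return (a & b) ^ (a & c) ^ (b & c); }

    /* One SHA-256 round on a working state s[8] = {a,b,c,d,e,f,g,h}, matching the
     * T1/T2 comments in PROCESS_LOOP; kt and wt are the round constant and schedule word. */
    static void sha256_round(uint32_t s[8], uint32_t kt, uint32_t wt)
    {
            uint32_t t1 = s[7] + big_sigma1(s[4]) + ch(s[4], s[5], s[6]) + kt + wt;
            uint32_t t2 = big_sigma0(s[0]) + maj(s[0], s[1], s[2]);

            s[7] = s[6]; s[6] = s[5]; s[5] = s[4];
            s[4] = s[3] + t1;
            s[3] = s[2]; s[2] = s[1]; s[1] = s[0];
            s[0] = t1 + t2;
    }
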
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c
new file mode 100644
index 000000000..8d9a828c6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_base.c
@@ -0,0 +1,188 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha256_internal.h"
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Base multi-hash SHA256 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// store_w is only used for step 0 ~ 15
+#define store_w(s, i, w, ww) (w[i][s] = to_be32(ww[i*HASH_SEGS+s]))
+#define Ws(x, s) w[(x) & 15][s]
+// update_w is used for step > 15
+#define update_w(s, i, w) \
+ Ws(i, s) = Ws(i-16, s) + S0(Ws(i-15, s)) + Ws(i-7, s) + S1(Ws(i-2, s))
+#define update_t2(s, a, b, c) t2[s] = s0(a[s]) + maj(a[s],b[s],c[s])
+#define update_t1(s, h, e, f, g, i, k) \
+ t1[s] = h[s] + s1(e[s]) + ch(e[s],f[s],g[s]) + k + Ws(i, s);
+#define update_d(s) d[s] += t1[s]
+#define update_h(s) h[s] = t1[s] + t2[s]
+
+// s is an iterator over the HASH_SEGS segments
+#define STORE_W(s, i, w, ww) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ store_w(s, i, w, ww);
+#define UPDATE_W(s, i, w) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ update_w(s, i, w);
+#define UPDATE_T2(s, a, b, c) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ update_t2(s, a, b, c);
+#define UPDATE_T1(s, h, e, f, g, i, k) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ update_t1(s, h, e, f, g, i, k);
+#define UPDATE_D(s) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ update_d(s);
+#define UPDATE_H(s) \
+ for(s = 0; s < HASH_SEGS; s++) \
+ update_h(s);
+
+static inline void step(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t * f,
+ uint32_t * g, uint32_t * h, uint32_t k,
+ uint32_t * t1, uint32_t * t2, uint32_t(*w)[HASH_SEGS], uint32_t * ww)
+{
+ uint8_t s;
+ if (i < 16) {
+ STORE_W(s, i, w, ww);
+ } else {
+ UPDATE_W(s, i, w);
+ }
+ UPDATE_T2(s, a, b, c);
+ UPDATE_T1(s, h, e, f, g, i, k);
+ UPDATE_D(s);
+ UPDATE_H(s);
+}
+
+static inline void init_abcdefgh(uint32_t * xx, uint32_t n,
+ uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS])
+{
+ uint8_t s;
+ for (s = 0; s < HASH_SEGS; s++)
+ xx[s] = digests[n][s];
+}
+
+static inline void add_abcdefgh(uint32_t * xx, uint32_t n,
+ uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS])
+{
+ uint8_t s;
+ for (s = 0; s < HASH_SEGS; s++)
+ digests[n][s] += xx[s];
+}
+
+/*
+ * Perform the 64 steps of the multi-hash SHA256 algorithm for a single
+ * block of data. The caller is responsible for supplying a full block.
+ *
+ * Arguments:
+ *	input        - pointer to the block of input data
+ *	digests      - the space that holds the digests of all the segments
+ *	frame_buffer - scratch buffer used to hold the message schedule (w)
+ *
+ * Return:
+ * N/A
+ */
+void mh_sha256_single(const uint8_t * input, uint32_t(*digests)[HASH_SEGS],
+ uint8_t * frame_buffer)
+{
+ uint8_t i;
+ uint32_t aa[HASH_SEGS], bb[HASH_SEGS], cc[HASH_SEGS], dd[HASH_SEGS];
+ uint32_t ee[HASH_SEGS], ff[HASH_SEGS], gg[HASH_SEGS], hh[HASH_SEGS];
+ uint32_t t1[HASH_SEGS], t2[HASH_SEGS];
+ uint32_t *ww = (uint32_t *) input;
+ uint32_t(*w)[HASH_SEGS];
+
+ const static uint32_t k[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+ };
+
+ w = (uint32_t(*)[HASH_SEGS]) frame_buffer;
+
+ init_abcdefgh(aa, 0, digests);
+ init_abcdefgh(bb, 1, digests);
+ init_abcdefgh(cc, 2, digests);
+ init_abcdefgh(dd, 3, digests);
+ init_abcdefgh(ee, 4, digests);
+ init_abcdefgh(ff, 5, digests);
+ init_abcdefgh(gg, 6, digests);
+ init_abcdefgh(hh, 7, digests);
+
+ for (i = 0; i < 64; i += 8) {
+ step(i, aa, bb, cc, dd, ee, ff, gg, hh, k[i], t1, t2, w, ww);
+ step(i + 1, hh, aa, bb, cc, dd, ee, ff, gg, k[i + 1], t1, t2, w, ww);
+ step(i + 2, gg, hh, aa, bb, cc, dd, ee, ff, k[i + 2], t1, t2, w, ww);
+ step(i + 3, ff, gg, hh, aa, bb, cc, dd, ee, k[i + 3], t1, t2, w, ww);
+ step(i + 4, ee, ff, gg, hh, aa, bb, cc, dd, k[i + 4], t1, t2, w, ww);
+ step(i + 5, dd, ee, ff, gg, hh, aa, bb, cc, k[i + 5], t1, t2, w, ww);
+ step(i + 6, cc, dd, ee, ff, gg, hh, aa, bb, k[i + 6], t1, t2, w, ww);
+ step(i + 7, bb, cc, dd, ee, ff, gg, hh, aa, k[i + 7], t1, t2, w, ww);
+ }
+
+ add_abcdefgh(aa, 0, digests);
+ add_abcdefgh(bb, 1, digests);
+ add_abcdefgh(cc, 2, digests);
+ add_abcdefgh(dd, 3, digests);
+ add_abcdefgh(ee, 4, digests);
+ add_abcdefgh(ff, 5, digests);
+ add_abcdefgh(gg, 6, digests);
+ add_abcdefgh(hh, 7, digests);
+}
+
+void mh_sha256_block_base(const uint8_t * input_data,
+ uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks)
+{
+ uint32_t i;
+
+ for (i = 0; i < num_blocks; i++) {
+ mh_sha256_single(input_data, digests, frame_buffer);
+ input_data += MH_SHA256_BLOCK_SIZE;
+ }
+
+ return;
+}
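
To make the interleaved layout concrete, below is a hedged usage sketch of mh_sha256_block_base: each
of the 16 segments starts from the standard SHA-256 initial hash values, stored word-major
(digests[word][segment]), and one call consumes one 1KB block. The constant values are assumptions
taken from the comments in this file (the real definitions live in mh_sha256_internal.h), and this
internal block routine is shown here only to illustrate the data layout.

    #include <stdint.h>

    /* Assumed to match mh_sha256_internal.h (see the layout comments above). */
    #define SHA256_DIGEST_WORDS  8
    #define HASH_SEGS            16
    #define MH_SHA256_BLOCK_SIZE 1024

    void mh_sha256_block_base(const uint8_t *input_data,
                              uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
                              uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);

    int main(void)
    {
            static const uint32_t iv[SHA256_DIGEST_WORDS] = {
                    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
            };
            uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS];
            uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE];
            uint8_t block[MH_SHA256_BLOCK_SIZE] = { 0 };    /* one 1KB block of input */

            /* Seed every segment with the SHA-256 initial hash values, word-major. */
            for (int w = 0; w < SHA256_DIGEST_WORDS; w++)
                    for (int s = 0; s < HASH_SEGS; s++)
                            digests[w][s] = iv[w];

            mh_sha256_block_base(block, digests, frame_buffer, 1);
            return 0;
    }
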
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm
new file mode 100644
index 000000000..b1d6fd9ea
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_block_sse.asm
@@ -0,0 +1,557 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA256 segment digests using SSE
+;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha256
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used for storing segs_digests on the stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*8*16 ;BYTES*DWORDS*SEGS
+
+; Common definitions
+%define ROUND tmp4
+%define TBL tmp5
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define MOVPS movups
+
+%define SZ 4
+%define SZ4 4*SZ
+%define ROUNDS 64*SZ4
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psrld %%reg, %%imm
+ pslld %%tmp, (32-(%%imm))
+ por %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_R 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ movdqa a0, e ; sig1: a0 = e
+ movdqa a1, e ; sig1: s1 = e
+ PRORD a0, (11-6) ; sig1: a0 = (e >> 5)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORD a1, 25 ; sig1: a1 = (e >> 25)
+ movdqa %%T1,[SZ4*(%%i&0xf) + %%data]
+ paddd %%T1,[TBL + ROUND] ; T1 = W + K
+ pxor a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ paddd h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORD a2, (13-2) ; sig0: a2 = (a >> 11)
+ paddd h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORD a1, 22 ; sig0: a1 = (a >> 22)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddd h, a0
+
+ paddd d, h
+
+ pxor a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddd h, a1 ; h = h + ch + W + K + maj
+ paddd h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15_W 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ movdqa a0, e ; sig1: a0 = e
+ movdqa a1, e ; sig1: s1 = e
+ PRORD a0, (11-6) ; sig1: a0 = (e >> 5)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORD a1, 25 ; sig1: a1 = (e >> 25)
+ movdqa [SZ4*(%%i&0xf) + %%data], %%T1
+ paddd %%T1,[TBL + ROUND] ; T1 = W + K
+ pxor a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ paddd h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORD a2, (13-2) ; sig0: a2 = (a >> 11)
+ paddd h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORD a1, 22 ; sig0: a1 = (a >> 22)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddd h, a0
+
+ paddd d, h
+
+ pxor a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddd h, a1 ; h = h + ch + W + K + maj
+ paddd h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 3
+%define %%T1 %1
+%define %%i %2
+%define %%data %3
+
+ movdqa %%T1, [SZ4*((%%i-15)&0xf) + %%data]
+ movdqa a1, [SZ4*((%%i-2)&0xf) + %%data]
+ movdqa a0, %%T1
+ PRORD %%T1, 18-7
+ movdqa a2, a1
+ PRORD a1, 19-17
+ pxor %%T1, a0
+ PRORD %%T1, 7
+ pxor a1, a2
+ PRORD a1, 17
+ psrld a0, 3
+ pxor %%T1, a0
+ psrld a2, 10
+ pxor a1, a2
+ paddd %%T1, [SZ4*((%%i-16)&0xf) + %%data]
+ paddd a1, [SZ4*((%%i-7)&0xf) + %%data]
+ paddd %%T1, a1
+
+ ROUND_00_15_W %%T1, %%i, %%data
+
+%endm
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h7 | h7 | h7 | ...| h7 | | Ha| Hb | Hc |...| Hp |
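+; (each 64-byte row holds one SHA-256 state word, replicated for the 16 segments)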
+
+align 32
+
+;void mh_sha256_block_sse(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including the segment digests (uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS])
+; arg 2 pointer to aligned_frame_buffer, which is used to save the big-endian data.
+; arg 3 number of 1KB blocks
+;
+mk_global mh_sha256_block_sse, function, internal
+func(mh_sha256_block_sse)
+ endbranch
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 16 Bytes needed by sse
+ and rsp, ~0x0F
+ lea TBL,[TABLE]
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 8
+ MOVPS a, [mh_digests_p + I*64 + 16*0]
+ MOVPS b, [mh_digests_p + I*64 + 16*1]
+ MOVPS c, [mh_digests_p + I*64 + 16*2]
+ MOVPS d, [mh_digests_p + I*64 + 16*3]
+
+ movdqa [rsp + I*64 + 16*0], a
+ movdqa [rsp + I*64 + 16*1], b
+ movdqa [rsp + I*64 + 16*2], c
+ movdqa [rsp + I*64 + 16*3], d
+ %assign I (I+1)
+ %endrep
+
+.block_loop:
+	;transform the input to big-endian and store it in the aligned frame buffer
+ movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*8 to DWORD*4_SEGS*8*4
+ %assign I 0
+ %rep 16
+ MOVPS TT0,[mh_in_p + I*64+0*16]
+ MOVPS TT1,[mh_in_p + I*64+1*16]
+ MOVPS TT2,[mh_in_p + I*64+2*16]
+ MOVPS TT3,[mh_in_p + I*64+3*16]
+
+ pshufb TT0, TMP
+ movdqa [mh_data_p +(I)*16 +0*256],TT0
+ pshufb TT1, TMP
+ movdqa [mh_data_p +(I)*16 +1*256],TT1
+ pshufb TT2, TMP
+ movdqa [mh_data_p +(I)*16 +2*256],TT2
+ pshufb TT3, TMP
+ movdqa [mh_data_p +(I)*16 +3*256],TT3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+	mov	pref, 1024		;avoid prefetching repeatedly
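+	; each pass of .segs_loop processes 4 of the 16 segments (mh_segs advances 16 bytes per pass)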
+ .segs_loop:
+ xor ROUND, ROUND
+ ;; Initialize digests
+ movdqa a, [rsp + 0*64 + mh_segs]
+ movdqa b, [rsp + 1*64 + mh_segs]
+ movdqa c, [rsp + 2*64 + mh_segs]
+ movdqa d, [rsp + 3*64 + mh_segs]
+ movdqa e, [rsp + 4*64 + mh_segs]
+ movdqa f, [rsp + 5*64 + mh_segs]
+ movdqa g, [rsp + 6*64 + mh_segs]
+ movdqa h, [rsp + 7*64 + mh_segs]
+
+ %assign i 0
+ %rep 4
+ ROUND_00_15_R TT0, (i*4+0), mh_data_p
+ ROUND_00_15_R TT1, (i*4+1), mh_data_p
+ ROUND_00_15_R TT2, (i*4+2), mh_data_p
+ ROUND_00_15_R TT3, (i*4+3), mh_data_p
+ %assign i (i+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+
+ %assign i 16
+ %rep 48
+ %if i = 48
+ PREFETCH_X [mh_in_p + pref+128*1]
+ %endif
+ ROUND_16_XX T1, i, mh_data_p
+ %assign i (i+1)
+ %endrep
+
+ ;; add old digest
+ paddd a, [rsp + 0*64 + mh_segs]
+ paddd b, [rsp + 1*64 + mh_segs]
+ paddd c, [rsp + 2*64 + mh_segs]
+ paddd d, [rsp + 3*64 + mh_segs]
+ paddd e, [rsp + 4*64 + mh_segs]
+ paddd f, [rsp + 5*64 + mh_segs]
+ paddd g, [rsp + 6*64 + mh_segs]
+ paddd h, [rsp + 7*64 + mh_segs]
+
+ ; write out digests
+ movdqa [rsp + 0*64 + mh_segs], a
+ movdqa [rsp + 1*64 + mh_segs], b
+ movdqa [rsp + 2*64 + mh_segs], c
+ movdqa [rsp + 3*64 + mh_segs], d
+ movdqa [rsp + 4*64 + mh_segs], e
+ movdqa [rsp + 5*64 + mh_segs], f
+ movdqa [rsp + 6*64 + mh_segs], g
+ movdqa [rsp + 7*64 + mh_segs], h
+
+ add pref, 256
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 8
+ movdqa a, [rsp + I*64 + 16*0]
+ movdqa b, [rsp + I*64 + 16*1]
+ movdqa c, [rsp + I*64 + 16*2]
+ movdqa d, [rsp + I*64 + 16*3]
+
+ MOVPS [mh_digests_p + I*64 + 16*0], a
+ MOVPS [mh_digests_p + I*64 + 16*1], b
+ MOVPS [mh_digests_p + I*64 + 16*2], c
+ MOVPS [mh_digests_p + I*64 + 16*3], d
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=16
+
+align 16
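+; SHA-256 round constants K[0..63], each 32-bit constant replicated across four lanes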
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
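+; pshufb mask to byte-swap each 32-bit word (little-endian -> big-endian)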
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c
new file mode 100644
index 000000000..6abb20688
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_finalize_base.c
@@ -0,0 +1,121 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * mh_sha256_finalize_base.c contains the default implementations of
+ * mh_sha256_finalize_XXX and mh_sha256_tail_XXX, namely mh_sha256_finalize_base
+ * and mh_sha256_tail_base. Other types are generated by mh_sha256.c through
+ * different predefined macros.
+ * mh_sha256_tail is used to calculate the last incomplete block of input
+ * data. mh_sha256_finalize is the mh_sha256_ctx wrapper of mh_sha256_tail.
+ */
+#ifndef MH_SHA256_FINALIZE_FUNCTION
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+#define MH_SHA256_FINALIZE_FUNCTION mh_sha256_finalize_base
+#define MH_SHA256_TAIL_FUNCTION mh_sha256_tail_base
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_base
+#define MH_SHA256_FINALIZE_SLVER
+#endif
+
+void MH_SHA256_TAIL_FUNCTION(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t * frame_buffer, uint32_t digests[SHA256_DIGEST_WORDS])
+{
+ uint64_t partial_buffer_len, len_in_bit;
+
+ partial_buffer_len = total_len % MH_SHA256_BLOCK_SIZE;
+
+ // Padding the first block
+ partial_buffer[partial_buffer_len] = 0x80;
+ partial_buffer_len++;
+ memset(partial_buffer + partial_buffer_len, 0,
+ MH_SHA256_BLOCK_SIZE - partial_buffer_len);
+
+	// Process the first block without the total length if the padding needs 2 blocks
+ if (partial_buffer_len > (MH_SHA256_BLOCK_SIZE - 8)) {
+ MH_SHA256_BLOCK_FUNCTION(partial_buffer, mh_sha256_segs_digests, frame_buffer,
+ 1);
+ //Padding the second block
+ memset(partial_buffer, 0, MH_SHA256_BLOCK_SIZE);
+ }
+	// Append the total length in bits and process the final padded block
+ len_in_bit = to_be64((uint64_t) total_len * 8);
+ *(uint64_t *) (partial_buffer + MH_SHA256_BLOCK_SIZE - 8) = len_in_bit;
+ MH_SHA256_BLOCK_FUNCTION(partial_buffer, mh_sha256_segs_digests, frame_buffer, 1);
+
+ //Calculate multi-hash SHA256 digests (segment digests as input message)
+ sha256_for_mh_sha256((uint8_t *) mh_sha256_segs_digests, digests,
+ 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
+
+int MH_SHA256_FINALIZE_FUNCTION(struct mh_sha256_ctx *ctx, void *mh_sha256_digest)
+{
+ uint8_t i;
+ uint8_t *partial_block_buffer;
+ uint64_t total_len;
+ uint32_t(*mh_sha256_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+
+ if (ctx == NULL)
+ return MH_SHA256_CTX_ERROR_NULL;
+
+ total_len = ctx->total_length;
+ partial_block_buffer = ctx->partial_block_buffer;
+
+ /* mh_sha256 tail */
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha256_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha256_interim_digests;
+
+ MH_SHA256_TAIL_FUNCTION(partial_block_buffer, total_len, mh_sha256_segs_digests,
+ aligned_frame_buffer, ctx->mh_sha256_digest);
+
+ /* Output the digests of mh_sha256 */
+ if (mh_sha256_digest != NULL) {
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++)
+ ((uint32_t *) mh_sha256_digest)[i] = ctx->mh_sha256_digest[i];
+ }
+
+ return MH_SHA256_CTX_ERROR_NONE;
+}
+
+#ifdef MH_SHA256_FINALIZE_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver mh_sha256_finalize_base_slver_000002bb;
+struct slver mh_sha256_finalize_base_slver = { 0x02bb, 0x00, 0x00 };
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h
new file mode 100644
index 000000000..8051e3f36
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_internal.h
@@ -0,0 +1,318 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA256_INTERNAL_H_
+#define _MH_SHA256_INTERNAL_H_
+
+/**
+ * @file mh_sha256_internal.h
+ * @brief mh_sha256 internal function prototypes and macros
+ *
+ * Interface for mh_sha256 internal functions
+ *
+ */
+#include <stdint.h>
+#include "mh_sha256.h"
+#include "endian_helper.h"
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+ // Round a pointer up to a 64-byte boundary
+#define ALIGN_64(pointer) ( ((uint64_t)(pointer) + 0x3F)&(~0x3F) )
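+ // e.g. ALIGN_64(0x1001) == 0x1040 and ALIGN_64(0x1040) == 0x1040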
+
+ /*******************************************************************
+ *mh_sha256 constants and macros
+ ******************************************************************/
+ /* mh_sha256 constants */
+#define MH_SHA256_H0 0x6a09e667UL
+#define MH_SHA256_H1 0xbb67ae85UL
+#define MH_SHA256_H2 0x3c6ef372UL
+#define MH_SHA256_H3 0xa54ff53aUL
+#define MH_SHA256_H4 0x510e527fUL
+#define MH_SHA256_H5 0x9b05688cUL
+#define MH_SHA256_H6 0x1f83d9abUL
+#define MH_SHA256_H7 0x5be0cd19UL
+
+ /* mh_sha256 macros */
+#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r))))
+
+#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3))
+#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10))
+
+#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22))
+#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
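+ /* Note: relative to FIPS 180-4 notation the capitalization is swapped here:
+  * S0/S1 are the message-schedule sigmas and s0/s1 are the round Sigmas. */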
+
+ /*******************************************************************
+ * SHA256 API internal function prototypes
+ ******************************************************************/
+
+ /**
+ * @brief Performs complete SHA256 algorithm.
+ *
+ * @param input_data  Pointer to buffer containing the input message.
+ * @param digest Pointer to digest to update.
+ * @param len Length of buffer.
+ * @returns None
+ */
+ void sha256_for_mh_sha256(const uint8_t * input_data, uint32_t * digest, const uint32_t len);
+
+ /**
+ * @brief Calculate the sha256 digest of a single block whose size is SHA256_BLOCK_SIZE
+ *
+ * @param data Pointer to data buffer containing the input message.
+ * @param digest Pointer to sha256 digest.
+ * @returns None
+ */
+ void sha256_single_for_mh_sha256(const uint8_t * data, uint32_t digest[]);
+
+ /*******************************************************************
+ * mh_sha256 API internal function prototypes
+ * Multiple versions of Update and Finalize functions are supplied which use
+ * multiple versions of block and tail process subfunctions.
+ ******************************************************************/
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @returns none
+ *
+ */
+ void mh_sha256_tail(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha256_digest mh_sha256 digest
+ * @returns none
+ *
+ */
+ void mh_sha256_tail_base(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * @requires SSE
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha256_digest mh_sha256 digest
+ * @returns none
+ *
+ */
+ void mh_sha256_tail_sse(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * @requires AVX
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha256_digest mh_sha256 digest
+ * @returns none
+ *
+ */
+ void mh_sha256_tail_avx(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * @requires AVX2
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha256_digest mh_sha256 digest
+ * @returns none
+ *
+ */
+ void mh_sha256_tail_avx2(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha256.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA256_BLOCK_SIZE.
+ * It will output the final SHA256 digest based on mh_sha256_segs_digests.
+ *
+ * @requires AVX512
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha256_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha256_digest mh_sha256 digest
+ * @returns none
+ *
+ */
+ void mh_sha256_tail_avx512(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha256_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha256_digest[SHA256_DIGEST_WORDS]);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose total size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose total size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block_base(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose total size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * @requires SSE
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block_sse(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose total size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * @requires AVX
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block_avx(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose total size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * @requires AVX2
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block_avx2(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha256 digest of blocks whose total size is MH_SHA256_BLOCK_SIZE*N.
+ *
+ * @requires AVX512
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha256_block_avx512(const uint8_t * input_data, uint32_t digests[SHA256_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm
new file mode 100644
index 000000000..e14fc7eb1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_multibinary.asm
@@ -0,0 +1,77 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf32
+ [bits 32]
+%else
+ default rel
+ [bits 64]
+
+ extern mh_sha256_update_sse
+ extern mh_sha256_update_avx
+ extern mh_sha256_update_avx2
+ extern mh_sha256_finalize_sse
+ extern mh_sha256_finalize_avx
+ extern mh_sha256_finalize_avx2
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ extern mh_sha256_update_avx512
+ extern mh_sha256_finalize_avx512
+ %endif
+
+%endif
+
+extern mh_sha256_update_base
+extern mh_sha256_finalize_base
+
+mbin_interface mh_sha256_update
+mbin_interface mh_sha256_finalize
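+; mbin_dispatch_init* wires each interface to the best available implementation
+; (base/sse/avx/avx2, plus avx512 when the assembler supports it), selected at
+; run time from detected CPU features.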
+
+%ifidn __OUTPUT_FORMAT__, elf64
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 mh_sha256_update, mh_sha256_update_base, mh_sha256_update_sse, mh_sha256_update_avx, mh_sha256_update_avx2, mh_sha256_update_avx512
+ mbin_dispatch_init6 mh_sha256_finalize, mh_sha256_finalize_base, mh_sha256_finalize_sse, mh_sha256_finalize_avx, mh_sha256_finalize_avx2, mh_sha256_finalize_avx512
+ %else
+ mbin_dispatch_init5 mh_sha256_update, mh_sha256_update_base, mh_sha256_update_sse, mh_sha256_update_avx, mh_sha256_update_avx2
+ mbin_dispatch_init5 mh_sha256_finalize, mh_sha256_finalize_base, mh_sha256_finalize_sse, mh_sha256_finalize_avx, mh_sha256_finalize_avx2
+ %endif
+
+%else
+ mbin_dispatch_init2 mh_sha256_update, mh_sha256_update_base
+ mbin_dispatch_init2 mh_sha256_finalize, mh_sha256_finalize_base
+%endif
+
+;;; func core, ver, snum
+slversion mh_sha256_update, 00, 00, 02b2
+slversion mh_sha256_finalize, 00, 00, 02b3
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c
new file mode 100644
index 000000000..8095e4f05
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_perf.c
@@ -0,0 +1,180 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha256.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Loop many times over same
+# define TEST_LEN 16*1024
+# define TEST_LOOPS 20000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define TEST_LEN 16*1024*1024
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#define TEST_MEM TEST_LEN
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA256_FUNC_TYPE
+#define MH_SHA256_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha256_update, MH_SHA256_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha256_finalize, MH_SHA256_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA256_CTX_ERROR_NONE){ \
+	    printf("The mh_sha256 function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 32 == 0)
+ printf("\n");
+ }
+ if (i % 32 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA256_DIGEST_WORDS],
+ uint32_t hash_test[SHA256_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha256_fail = 0;
+
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha256_fail++;
+ }
+
+ if (mh_sha256_fail) {
+ printf("mh_sha256 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 32);
+ printf("ref: ");
+ dump((char *)hash_test, 32);
+ }
+
+ return mh_sha256_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int i, fail = 0;
+ uint32_t hash_test[SHA256_DIGEST_WORDS], hash_base[SHA256_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ struct mh_sha256_ctx *update_ctx_test = NULL, *update_ctx_base = NULL;
+ struct perf start, stop;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_perf:\n");
+
+ buff = malloc(TEST_LEN);
+ update_ctx_test = malloc(sizeof(*update_ctx_test));
+ update_ctx_base = malloc(sizeof(*update_ctx_base));
+
+ if (buff == NULL || update_ctx_base == NULL || update_ctx_test == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ // mh_sha256 base version
+ mh_sha256_init(update_ctx_base);
+ mh_sha256_update_base(update_ctx_base, buff, TEST_LEN);
+ mh_sha256_finalize_base(update_ctx_base, hash_base);
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS / 10; i++) {
+ mh_sha256_init(update_ctx_base);
+ mh_sha256_update_base(update_ctx_base, buff, TEST_LEN);
+ mh_sha256_finalize_base(update_ctx_base, hash_base);
+ }
+ perf_stop(&stop);
+ printf("mh_sha256_update_base" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ //Update feature test
+ CHECK_RETURN(mh_sha256_init(update_ctx_test));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test));
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ CHECK_RETURN(mh_sha256_init(update_ctx_test));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test));
+ }
+ perf_stop(&stop);
+ printf(xstr(TEST_UPDATE_FUNCTION) TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ // Check results
+ fail = compare_digests(hash_base, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", TEST_LEN);
+ return -1;
+ }
+
+ if (fail)
+ printf("Test failed function test%d\n", fail);
+ else
+ printf("Pass func check\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c
new file mode 100644
index 000000000..2aaefecb0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_ref.c
@@ -0,0 +1,410 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha256_internal.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+	// The macros and sub-functions below already exist in the source file
+	// sha256_for_mh_sha256.c, which is part of the ISA-L library as internal
+	// functions. They are duplicated here because of a linking issue:
+	// mh_sha256_ref() needs these macros and sub-functions without linking
+	// against the ISA-L library, so it keeps its own copies in order to
+	// contain the essential sub-functions in its own object file.
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#define W(x) w[(x) & 15]
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be32(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
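+// Each step() below performs one SHA-256 round; instead of rotating the working
+// variables a..h, the 64 calls permute the argument order.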
+
+void sha256_single_for_mh_sha256_ref(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e, f, g, h, t1, t2;
+ uint32_t w[16];
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98);
+ step(1, h, a, b, c, d, e, f, g, 0x71374491);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+ step(31, b, c, d, e, f, g, h, a, 0x14292967);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+ step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+ step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
+
+void sha256_for_mh_sha256_ref(const uint8_t * input_data, uint32_t * digest,
+ const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA256_BLOCK_SIZE];
+
+ digest[0] = MH_SHA256_H0;
+ digest[1] = MH_SHA256_H1;
+ digest[2] = MH_SHA256_H2;
+ digest[3] = MH_SHA256_H3;
+ digest[4] = MH_SHA256_H4;
+ digest[5] = MH_SHA256_H5;
+ digest[6] = MH_SHA256_H6;
+ digest[7] = MH_SHA256_H7;
+
+ i = len;
+ while (i >= SHA256_BLOCK_SIZE) {
+ sha256_single_for_mh_sha256_ref(input_data, digest);
+ input_data += SHA256_BLOCK_SIZE;
+ i -= SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - 8); j++)
+ buf[j] = 0;
+
+ if (i > SHA256_BLOCK_SIZE - 8)
+ i = 2 * SHA256_BLOCK_SIZE;
+ else
+ i = SHA256_BLOCK_SIZE;
+
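+	// append the message length in bits (big-endian), per the SHA-256 padding rule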
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha256_single_for_mh_sha256_ref(buf, digest);
+ if (i == (2 * SHA256_BLOCK_SIZE))
+ sha256_single_for_mh_sha256_ref(buf + SHA256_BLOCK_SIZE, digest);
+}
+
+/*
+ * Rearrange one segment's data from one block into the new_data buffer.
+ *
+ * Layout of new_data:
+ * segment
+ * -------------------------
+ * w0 | w1 | ... | w15
+ *
+ */
+static inline void transform_input_single(uint32_t * new_data, uint32_t * input,
+ uint32_t segment)
+{
+ new_data[16 * segment + 0] = input[16 * 0 + segment];
+ new_data[16 * segment + 1] = input[16 * 1 + segment];
+ new_data[16 * segment + 2] = input[16 * 2 + segment];
+ new_data[16 * segment + 3] = input[16 * 3 + segment];
+ new_data[16 * segment + 4] = input[16 * 4 + segment];
+ new_data[16 * segment + 5] = input[16 * 5 + segment];
+ new_data[16 * segment + 6] = input[16 * 6 + segment];
+ new_data[16 * segment + 7] = input[16 * 7 + segment];
+ new_data[16 * segment + 8] = input[16 * 8 + segment];
+ new_data[16 * segment + 9] = input[16 * 9 + segment];
+ new_data[16 * segment + 10] = input[16 * 10 + segment];
+ new_data[16 * segment + 11] = input[16 * 11 + segment];
+ new_data[16 * segment + 12] = input[16 * 12 + segment];
+ new_data[16 * segment + 13] = input[16 * 13 + segment];
+ new_data[16 * segment + 14] = input[16 * 14 + segment];
+ new_data[16 * segment + 15] = input[16 * 15 + segment];
+}
+
+// Adapt parameters to sha256_single_for_mh_sha256_ref
+#define sha256_update_one_seg(data, digest) \
+ sha256_single_for_mh_sha256_ref((const uint8_t *)(data), (uint32_t *)(digest))
+
+/*
+ * Rearrange all segments' data from one block into the new_data buffer.
+ *
+ * Layout of new_data:
+ * segment
+ * -------------------------
+ * seg0: | w0 | w1 | ... | w15
+ * seg1: | w0 | w1 | ... | w15
+ * seg2: | w0 | w1 | ... | w15
+ * ....
+ * seg15: | w0 | w1 | ... | w15
+ *
+ */
+static inline void transform_input(uint32_t * new_data, uint32_t * input, uint32_t block)
+{
+ uint32_t *current_input = input + block * MH_SHA256_BLOCK_SIZE / 4;
+
+ transform_input_single(new_data, current_input, 0);
+ transform_input_single(new_data, current_input, 1);
+ transform_input_single(new_data, current_input, 2);
+ transform_input_single(new_data, current_input, 3);
+ transform_input_single(new_data, current_input, 4);
+ transform_input_single(new_data, current_input, 5);
+ transform_input_single(new_data, current_input, 6);
+ transform_input_single(new_data, current_input, 7);
+ transform_input_single(new_data, current_input, 8);
+ transform_input_single(new_data, current_input, 9);
+ transform_input_single(new_data, current_input, 10);
+ transform_input_single(new_data, current_input, 11);
+ transform_input_single(new_data, current_input, 12);
+ transform_input_single(new_data, current_input, 13);
+ transform_input_single(new_data, current_input, 14);
+ transform_input_single(new_data, current_input, 15);
+
+}
+
+/*
+ * Update all segments' digests with the data of one block.
+ *
+ * Layout of seg_digest:
+ * segment
+ * -------------------------
+ * seg0: | H0 | H1 | ... | H7
+ * seg1: | H0 | H1 | ... | H7
+ * seg2: | H0 | H1 | ... | H7
+ * ....
+ * seg15: | H0 | H1 | ... | H7
+ *
+ */
+static inline void sha256_update_all_segs(uint32_t * new_data, uint32_t(*mh_sha256_seg_digests)
+ [SHA256_DIGEST_WORDS])
+{
+ sha256_update_one_seg(&(new_data)[16 * 0], mh_sha256_seg_digests[0]);
+ sha256_update_one_seg(&(new_data)[16 * 1], mh_sha256_seg_digests[1]);
+ sha256_update_one_seg(&(new_data)[16 * 2], mh_sha256_seg_digests[2]);
+ sha256_update_one_seg(&(new_data)[16 * 3], mh_sha256_seg_digests[3]);
+ sha256_update_one_seg(&(new_data)[16 * 4], mh_sha256_seg_digests[4]);
+ sha256_update_one_seg(&(new_data)[16 * 5], mh_sha256_seg_digests[5]);
+ sha256_update_one_seg(&(new_data)[16 * 6], mh_sha256_seg_digests[6]);
+ sha256_update_one_seg(&(new_data)[16 * 7], mh_sha256_seg_digests[7]);
+ sha256_update_one_seg(&(new_data)[16 * 8], mh_sha256_seg_digests[8]);
+ sha256_update_one_seg(&(new_data)[16 * 9], mh_sha256_seg_digests[9]);
+ sha256_update_one_seg(&(new_data)[16 * 10], mh_sha256_seg_digests[10]);
+ sha256_update_one_seg(&(new_data)[16 * 11], mh_sha256_seg_digests[11]);
+ sha256_update_one_seg(&(new_data)[16 * 12], mh_sha256_seg_digests[12]);
+ sha256_update_one_seg(&(new_data)[16 * 13], mh_sha256_seg_digests[13]);
+ sha256_update_one_seg(&(new_data)[16 * 14], mh_sha256_seg_digests[14]);
+ sha256_update_one_seg(&(new_data)[16 * 15], mh_sha256_seg_digests[15]);
+}
+
+void mh_sha256_block_ref(const uint8_t * input_data, uint32_t(*digests)[HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE], uint32_t num_blocks)
+{
+ uint32_t i, j;
+ uint32_t *temp_buffer = (uint32_t *) frame_buffer;
+ uint32_t(*trans_digests)[SHA256_DIGEST_WORDS];
+
+ trans_digests = (uint32_t(*)[SHA256_DIGEST_WORDS]) digests;
+
+	// Re-structure seg_digests from 8*16 to 16*8
+ for (j = 0; j < HASH_SEGS; j++) {
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+ temp_buffer[j * SHA256_DIGEST_WORDS + i] = digests[i][j];
+ }
+ }
+ memcpy(trans_digests, temp_buffer, 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+ // Calculate digests for all segments, leveraging sha256 API
+ for (i = 0; i < num_blocks; i++) {
+ transform_input(temp_buffer, (uint32_t *) input_data, i);
+ sha256_update_all_segs(temp_buffer, trans_digests);
+ }
+
+	// Re-structure seg_digests from 16*8 to 8*16
+ for (j = 0; j < HASH_SEGS; j++) {
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+ temp_buffer[i * HASH_SEGS + j] = trans_digests[j][i];
+ }
+ }
+ memcpy(digests, temp_buffer, 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
+
+void mh_sha256_tail_ref(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha256_segs_digests)[HASH_SEGS], uint8_t * frame_buffer,
+ uint32_t digests[SHA256_DIGEST_WORDS])
+{
+ uint64_t partial_buffer_len, len_in_bit;
+
+ partial_buffer_len = total_len % MH_SHA256_BLOCK_SIZE;
+
+ // Padding the first block
+ partial_buffer[partial_buffer_len] = 0x80;
+ partial_buffer_len++;
+ memset(partial_buffer + partial_buffer_len, 0,
+ MH_SHA256_BLOCK_SIZE - partial_buffer_len);
+
+	// Process the first block without the total length if the padding needs 2 blocks
+ if (partial_buffer_len > (MH_SHA256_BLOCK_SIZE - 8)) {
+ mh_sha256_block_ref(partial_buffer, mh_sha256_segs_digests, frame_buffer, 1);
+ //Padding the second block
+ memset(partial_buffer, 0, MH_SHA256_BLOCK_SIZE);
+ }
+	// Append the total length in bits and process the final padded block
+ len_in_bit = to_be64((uint64_t) total_len * 8);
+ *(uint64_t *) (partial_buffer + MH_SHA256_BLOCK_SIZE - 8) = len_in_bit;
+ mh_sha256_block_ref(partial_buffer, mh_sha256_segs_digests, frame_buffer, 1);
+
+ //Calculate multi-hash SHA256 digests (segment digests as input message)
+ sha256_for_mh_sha256_ref((uint8_t *) mh_sha256_segs_digests, digests,
+ 4 * SHA256_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
+
+void mh_sha256_ref(const void *buffer, uint32_t len, uint32_t * mh_sha256_digest)
+{
+ uint64_t total_len;
+ uint64_t num_blocks;
+ uint32_t mh_sha256_segs_digests[SHA256_DIGEST_WORDS][HASH_SEGS];
+ uint8_t frame_buffer[MH_SHA256_BLOCK_SIZE];
+ uint8_t partial_block_buffer[MH_SHA256_BLOCK_SIZE * 2];
+ uint32_t mh_sha256_hash_dword[SHA256_DIGEST_WORDS];
+ uint32_t i;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+ /* Initialize digests of all segments */
+ for (i = 0; i < HASH_SEGS; i++) {
+ mh_sha256_segs_digests[0][i] = MH_SHA256_H0;
+ mh_sha256_segs_digests[1][i] = MH_SHA256_H1;
+ mh_sha256_segs_digests[2][i] = MH_SHA256_H2;
+ mh_sha256_segs_digests[3][i] = MH_SHA256_H3;
+ mh_sha256_segs_digests[4][i] = MH_SHA256_H4;
+ mh_sha256_segs_digests[5][i] = MH_SHA256_H5;
+ mh_sha256_segs_digests[6][i] = MH_SHA256_H6;
+ mh_sha256_segs_digests[7][i] = MH_SHA256_H7;
+ }
+
+ total_len = len;
+
+ // Calculate blocks
+ num_blocks = len / MH_SHA256_BLOCK_SIZE;
+ if (num_blocks > 0) {
+		// process num_blocks full blocks
+ mh_sha256_block_ref(input_data, mh_sha256_segs_digests, frame_buffer,
+ num_blocks);
+ len -= num_blocks * MH_SHA256_BLOCK_SIZE;
+ input_data += num_blocks * MH_SHA256_BLOCK_SIZE;
+ }
+ // Store the partial block
+ if (len != 0) {
+ memcpy(partial_block_buffer, input_data, len);
+ }
+
+ /* Finalize */
+ mh_sha256_tail_ref(partial_block_buffer, total_len, mh_sha256_segs_digests,
+ frame_buffer, mh_sha256_hash_dword);
+
+ // Output the digests of mh_sha256
+ if (mh_sha256_digest != NULL) {
+ mh_sha256_digest[0] = mh_sha256_hash_dword[0];
+ mh_sha256_digest[1] = mh_sha256_hash_dword[1];
+ mh_sha256_digest[2] = mh_sha256_hash_dword[2];
+ mh_sha256_digest[3] = mh_sha256_hash_dword[3];
+ mh_sha256_digest[4] = mh_sha256_hash_dword[4];
+ mh_sha256_digest[5] = mh_sha256_hash_dword[5];
+ mh_sha256_digest[6] = mh_sha256_hash_dword[6];
+ mh_sha256_digest[7] = mh_sha256_hash_dword[7];
+ }
+
+ return;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c
new file mode 100644
index 000000000..13ab91c16
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_test.c
@@ -0,0 +1,217 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha256.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA256_FUNC_TYPE
+#define MH_SHA256_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha256_update, MH_SHA256_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha256_finalize, MH_SHA256_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA256_CTX_ERROR_NONE){ \
+	    printf("The mh_sha256 function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha256_ref(const void *buffer, uint32_t len, uint32_t * mh_sha256_digest);
+#define MH_SHA256_REF mh_sha256_ref
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 32 == 0)
+ printf("\n");
+ }
+ if (i % 32 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_ref[SHA256_DIGEST_WORDS],
+ uint32_t hash_test[SHA256_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha256_fail = 0;
+
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_ref[i])
+ mh_sha256_fail++;
+ }
+
+ if (mh_sha256_fail) {
+ printf("mh_sha256 fail test\n");
+ printf("ref: ");
+ dump((char *)hash_ref, 32);
+ printf("test: ");
+ dump((char *)hash_test, 32);
+ }
+
+ return mh_sha256_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0;
+ uint32_t hash_test[SHA256_DIGEST_WORDS], hash_ref[SHA256_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int size, offset;
+ struct mh_sha256_ctx *update_ctx = NULL;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:\n");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ MH_SHA256_REF(buff, TEST_LEN, hash_ref);
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+ // Test various size messages
+ for (size = TEST_LEN; size >= 0; size--) {
+
+ // Fill with rand data
+ rand_buffer(buff, size);
+
+ MH_SHA256_REF(buff, size, hash_ref);
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ if ((size & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various buffer offsets and sizes
+ printf("offset tests");
+ for (size = TEST_LEN - 256; size > 256; size -= 11) {
+ for (offset = 0; offset < 256; offset++) {
+ MH_SHA256_REF(buff + offset, size, hash_ref);
+
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ }
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Run efence tests
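+	// (each message ends exactly at the end of buff, to catch reads past the end)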
+ printf("efence tests");
+ for (size = TEST_SIZE; size > 0; size--) {
+ offset = TEST_LEN - size;
+
+ MH_SHA256_REF(buff + offset, size, hash_ref);
+
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:");
+ printf(" %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c
new file mode 100644
index 000000000..024ae2b91
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_base.c
@@ -0,0 +1,110 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * mh_sha256_update_base.c contains the template for mh_sha256_update_XXX.
+ * The default definitions use the base type, which generates mh_sha256_update_base.
+ * Other variants are generated by mh_sha256.c through different predefined macros.
+ */
+#ifndef MH_SHA256_UPDATE_FUNCTION
+#include "mh_sha256_internal.h"
+#include <string.h>
+
+#define MH_SHA256_UPDATE_FUNCTION mh_sha256_update_base
+#define MH_SHA256_BLOCK_FUNCTION mh_sha256_block_base
+#define MH_SHA256_UPDATE_SLVER
+#endif
+
+int MH_SHA256_UPDATE_FUNCTION(struct mh_sha256_ctx *ctx, const void *buffer, uint32_t len)
+{
+
+ uint8_t *partial_block_buffer;
+ uint64_t partial_block_len;
+ uint64_t num_blocks;
+ uint32_t(*mh_sha256_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+ if (ctx == NULL)
+ return MH_SHA256_CTX_ERROR_NULL;
+
+ if (len == 0)
+ return MH_SHA256_CTX_ERROR_NONE;
+
+ partial_block_len = ctx->total_length % MH_SHA256_BLOCK_SIZE;
+ partial_block_buffer = ctx->partial_block_buffer;
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha256_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha256_interim_digests;
+
+ ctx->total_length += len;
+ // Not enough input data for an mh_sha256 block calculation
+ if (len + partial_block_len < MH_SHA256_BLOCK_SIZE) {
+ memcpy(partial_block_buffer + partial_block_len, input_data, len);
+ return MH_SHA256_CTX_ERROR_NONE;
+ }
+ // mh_sha256 calculation for the previous partial block
+ if (partial_block_len != 0) {
+ memcpy(partial_block_buffer + partial_block_len, input_data,
+ MH_SHA256_BLOCK_SIZE - partial_block_len);
+ // process the completed block
+ MH_SHA256_BLOCK_FUNCTION(partial_block_buffer, mh_sha256_segs_digests,
+ aligned_frame_buffer, 1);
+ input_data += MH_SHA256_BLOCK_SIZE - partial_block_len;
+ len -= MH_SHA256_BLOCK_SIZE - partial_block_len;
+ memset(partial_block_buffer, 0, MH_SHA256_BLOCK_SIZE);
+ }
+ // Calculate mh_sha256 for the current blocks
+ num_blocks = len / MH_SHA256_BLOCK_SIZE;
+ if (num_blocks > 0) {
+ // process num_blocks whole blocks
+ MH_SHA256_BLOCK_FUNCTION(input_data, mh_sha256_segs_digests,
+ aligned_frame_buffer, num_blocks);
+ len -= num_blocks * MH_SHA256_BLOCK_SIZE;
+ input_data += num_blocks * MH_SHA256_BLOCK_SIZE;
+ }
+ // Store the partial block
+ if (len != 0) {
+ memcpy(partial_block_buffer, input_data, len);
+ }
+
+ return MH_SHA256_CTX_ERROR_NONE;
+
+}
+
+#ifdef MH_SHA256_UPDATE_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver mh_sha256_update_base_slver_000002ba;
+struct slver mh_sha256_update_base_slver = { 0x02ba, 0x00, 0x00 };
+#endif
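
For readers following the buffering logic above, here is a minimal, self-contained sketch of the same partial-block scheme. The names (struct stream, process_blocks, BLOCK) are illustrative stand-ins and not the ISA-L API; the real routine instead dispatches MH_SHA256_BLOCK_FUNCTION over HASH_SEGS interleaved segments.

#include <stdint.h>
#include <string.h>

#define BLOCK 64                     /* block size in bytes */

struct stream {
	uint8_t pending[BLOCK];      /* input bytes not yet hashed */
	uint64_t total;              /* total bytes seen so far */
};

/* stand-in for MH_SHA256_BLOCK_FUNCTION */
static void process_blocks(const uint8_t *p, uint64_t nblocks)
{
	(void)p; (void)nblocks;
}

void stream_update(struct stream *s, const uint8_t *in, uint32_t len)
{
	uint64_t have = s->total % BLOCK;        /* bytes already buffered */

	s->total += len;
	if (have + len < BLOCK) {                /* still short of one block */
		memcpy(s->pending + have, in, len);
		return;
	}
	if (have) {                              /* finish the buffered block first */
		memcpy(s->pending + have, in, BLOCK - have);
		process_blocks(s->pending, 1);
		in += BLOCK - have;
		len -= BLOCK - have;
	}
	process_blocks(in, len / BLOCK);         /* whole blocks straight from input */
	memcpy(s->pending, in + (len / BLOCK) * BLOCK, len % BLOCK);
}

After each call the pending buffer holds exactly total % BLOCK bytes, which is the invariant the update/finalize pair above relies on.
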
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c
new file mode 100644
index 000000000..f5b28bba7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/mh_sha256_update_test.c
@@ -0,0 +1,240 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha256.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA256_FUNC_TYPE
+#define MH_SHA256_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha256_update, MH_SHA256_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha256_finalize, MH_SHA256_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA256_CTX_ERROR_NONE){ \
+ printf("The mh_sha256 function is failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha256_ref(const void *buffer, uint32_t len, uint32_t * mh_sha256_digest);
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_ref[SHA256_DIGEST_WORDS],
+ uint32_t hash_test[SHA256_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha256_fail = 0;
+
+ for (i = 0; i < SHA256_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_ref[i])
+ mh_sha256_fail++;
+ }
+
+ if (mh_sha256_fail) {
+ printf("mh_sha256 fail test\n");
+ printf("ref: ");
+ dump((char *)hash_ref, 20);
+ printf("test: ");
+ dump((char *)hash_test, 20);
+ }
+
+ return mh_sha256_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0, i;
+ uint32_t hash_test[SHA256_DIGEST_WORDS], hash_ref[SHA256_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int update_count;
+ int size1, size2, offset, addr_offset;
+ struct mh_sha256_ctx *update_ctx = NULL;
+ uint8_t *mem_addr = NULL;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha256_ref(buff, TEST_LEN, hash_ref);
+
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+ // Test various size messages by update twice.
+ printf("\n various size messages by update twice tests");
+ for (size1 = TEST_LEN; size1 >= 0; size1--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha256_ref(buff, TEST_LEN, hash_ref);
+
+ // subsequent update
+ size2 = TEST_LEN - size1; // size2 differs from size1
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size1));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + size1, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various update count
+ printf("\n various update count tests");
+ for (update_count = 1; update_count <= TEST_LEN; update_count++) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha256_ref(buff, TEST_LEN, hash_ref);
+
+ // subsequent update
+ size1 = TEST_LEN / update_count;
+ size2 = TEST_LEN - size1 * (update_count - 1); // size2 differs from size1
+
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ for (i = 1, offset = 0; i < update_count; i++) {
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size1));
+ offset += size1;
+ }
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // test various start address of ctx.
+ printf("\n various start address of ctx test");
+ free(update_ctx);
+ mem_addr = (uint8_t *) malloc(sizeof(*update_ctx) + AVX512_ALIGNED * 10);
+ for (addr_offset = AVX512_ALIGNED * 10; addr_offset >= 0; addr_offset--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha256_ref(buff, TEST_LEN, hash_ref);
+
+ // an unaligned offset
+ update_ctx = (struct mh_sha256_ctx *)(mem_addr + addr_offset);
+ CHECK_RETURN(mh_sha256_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail addr_offset=%d\n", addr_offset);
+ return -1;
+ }
+
+ if ((addr_offset & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c b/src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c
new file mode 100644
index 000000000..ea8c9f436
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha256/sha256_for_mh_sha256.c
@@ -0,0 +1,176 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha256_internal.h"
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA256 Functions for mh_sha256
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#define W(x) w[(x) & 15]
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be32(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
+
+void sha256_single_for_mh_sha256(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e, f, g, h, t1, t2;
+ uint32_t w[16];
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98);
+ step(1, h, a, b, c, d, e, f, g, 0x71374491);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+ step(31, b, c, d, e, f, g, h, a, 0x14292967);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+ step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+ step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
+
+void sha256_for_mh_sha256(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA256_BLOCK_SIZE];
+
+ digest[0] = MH_SHA256_H0;
+ digest[1] = MH_SHA256_H1;
+ digest[2] = MH_SHA256_H2;
+ digest[3] = MH_SHA256_H3;
+ digest[4] = MH_SHA256_H4;
+ digest[5] = MH_SHA256_H5;
+ digest[6] = MH_SHA256_H6;
+ digest[7] = MH_SHA256_H7;
+
+ i = len;
+ while (i >= SHA256_BLOCK_SIZE) {
+ sha256_single_for_mh_sha256(input_data, digest);
+ input_data += SHA256_BLOCK_SIZE;
+ i -= SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - 8); j++)
+ buf[j] = 0;
+
+ if (i > SHA256_BLOCK_SIZE - 8)
+ i = 2 * SHA256_BLOCK_SIZE;
+ else
+ i = SHA256_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha256_single_for_mh_sha256(buf, digest);
+ if (i == (2 * SHA256_BLOCK_SIZE))
+ sha256_single_for_mh_sha256(buf + SHA256_BLOCK_SIZE, digest);
+}
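
The tail handling in sha256_for_mh_sha256() follows the usual SHA-256 padding rule: append 0x80, zero-fill, and store the message length in bits as a 64-bit big-endian value at the end of the final block; if fewer than nine bytes remain in the current block, the padding spills into a second block. A small sketch of that rule, with padded_blocks() as an illustrative helper that is not part of ISA-L:

#include <stdint.h>

/* number of 64-byte blocks needed to pad a tail of tail_len bytes (tail_len = len % 64) */
static uint32_t padded_blocks(uint32_t tail_len)
{
	/* one byte for the 0x80 marker plus eight bytes for the bit-length field */
	return (tail_len + 1 + 8 <= 64) ? 1 : 2;
}

/* e.g. padded_blocks(55) == 1, while padded_blocks(56) == 2 */
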
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/Makefile.am b/src/crypto/isa-l/isa-l_crypto/rolling_hash/Makefile.am
new file mode 100644
index 000000000..a16209248
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/Makefile.am
@@ -0,0 +1,57 @@
+########################################################################
+# Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += rolling_hash/rolling_hashx_base.c
+lsrc_x86_64 += rolling_hash/rolling_hash2.c
+lsrc_x86_64 += rolling_hash/rolling_hash2_until_04.asm
+lsrc_x86_64 += rolling_hash/rolling_hash2_until_00.asm
+lsrc_x86_64 += rolling_hash/rolling_hash2_multibinary.asm
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_base_aliases += rolling_hash/rolling_hashx_base.c \
+ rolling_hash/rolling_hash2.c \
+ rolling_hash/rolling_hash2_base_aliases.c
+
+
+lsrc_aarch64 += rolling_hash/rolling_hashx_base.c \
+ rolling_hash/rolling_hash2.c \
+ rolling_hash/aarch64/rolling_hash2_aarch64_multibinary.S \
+ rolling_hash/aarch64/rolling_hash2_aarch64_dispatcher.c \
+ rolling_hash/aarch64/rolling_hash2_run_until_unroll.S
+
+src_include += -I $(srcdir)/rolling_hash
+extern_hdrs += include/rolling_hashx.h
+
+other_src += rolling_hash/rolling_hash2_table.h
+other_src += include/test.h include/types.h
+
+check_tests += rolling_hash/rolling_hash2_test
+perf_tests += rolling_hash/rolling_hash2_perf
+other_tests += rolling_hash/chunking_with_mb_hash
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_dispatcher.c
new file mode 100644
index 000000000..98692e162
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_dispatcher.c
@@ -0,0 +1,37 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(rolling_hash2_run_until)
+{
+ return PROVIDER_INFO(rolling_hash2_run_until_unroll);
+
+ //~ return PROVIDER_BASIC(rolling_hash2_run_until);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_multibinary.S b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_multibinary.S
new file mode 100644
index 000000000..efbe44a18
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_aarch64_multibinary.S
@@ -0,0 +1,35 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface rolling_hash2_run_until
+
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_run_until_unroll.S b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_run_until_unroll.S
new file mode 100644
index 000000000..7ba04efbd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/aarch64/rolling_hash2_run_until_unroll.S
@@ -0,0 +1,115 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+/*
+ uint64_t rolling_hash2_run_until_unroll(
+ uint32_t * idx, int max_idx, uint64_t * t1,
+ uint64_t * t2, uint8_t * b1, uint8_t * b2, uint64_t h,
+ uint64_t mask, uint64_t trigger)
+*/
+ idx_addr .req x0
+ max_idx .req w1 //signed int
+ t1_addr .req x2
+ t2_addr .req x3
+ b1_addr .req x4
+ b2_addr .req x5
+ h .req x6
+ mask .req x7
+ trigger .req x12
+
+ idx .req w8
+
+ dat1 .req x9
+ dat2 .req x10
+ wdat1 .req w9
+ wdat2 .req w10
+ tmp_loop .req w11
+
+ t1 .req x13
+ t2 .req x14
+
+.macro round off:req
+ ubfx t1,dat1,8*\off\(),8
+ ubfx t2,dat2,8*\off\(),8
+ ldr t1,[t1_addr,t1,lsl 3]
+ ldr t2,[t2_addr,t2,lsl 3]
+ eor t1,t2,t1
+ eor h,t1,h,ror 63
+ and t2,h,mask
+ cmp t2,trigger
+ beq exit_ret
+ add idx, idx,1
+.endm
+ .global rolling_hash2_run_until_unroll
+ .type rolling_hash2_run_until_unroll, %function
+rolling_hash2_run_until_unroll:
+ ldr trigger,[sp]
+ ldr idx,[idx_addr]
+ sub tmp_loop,max_idx,8
+ cmp idx,tmp_loop
+ bge unroll_loop_end
+unroll_loop:
+ ldr dat1,[b1_addr,idx,sxtw]
+ ldr dat2,[b2_addr,idx,sxtw]
+
+ round 0
+ round 1
+ round 2
+ round 3
+ round 4
+ round 5
+ round 6
+ round 7
+ cmp tmp_loop,idx
+ bgt unroll_loop
+unroll_loop_end:
+ cmp idx,max_idx
+ bge exit_ret
+loop:
+ ldrb wdat1,[b1_addr,idx,sxtw]
+ ldrb wdat2,[b2_addr,idx,sxtw]
+ ldr t1,[t1_addr,dat1,lsl 3]
+ ldr t2,[t2_addr,dat2,lsl 3]
+ eor t1,t2,t1
+ eor h,t1,h,ror 63
+ and t2,h,mask
+ cmp t2,trigger
+ beq exit_ret
+ add idx,idx,1
+ cmp max_idx,idx
+ bgt loop
+exit_ret:
+ str idx,[idx_addr]
+ mov x0,h
+ ret
+ .size rolling_hash2_run_until_unroll, .-rolling_hash2_run_until_unroll
+
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/chunking_with_mb_hash.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/chunking_with_mb_hash.c
new file mode 100644
index 000000000..23062c3ef
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/chunking_with_mb_hash.c
@@ -0,0 +1,222 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <assert.h>
+#include "rolling_hashx.h"
+#include "sha256_mb.h"
+#include "test.h"
+
+#define MAX_BUFFER_SIZE 128*1024*1024
+#define HASH_POOL_SIZE SHA256_MAX_LANES
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define FILTER_BITS 10
+#define FILTER_SIZE (1 << FILTER_BITS)
+#define FILTER_MASK (FILTER_SIZE - 1)
+
+#define BITS_TO_INDEX_LONG 6
+#define MASK_TO_INDEX_LONG ((1 << BITS_TO_INDEX_LONG) - 1)
+
+// Globals
+SHA256_HASH_CTX ctxpool[SHA256_MAX_LANES], *last_ctx;
+SHA256_HASH_CTX_MGR mb_hash_mgr;
+uint64_t filter_table[FILTER_SIZE];
+unsigned long chunks_created = 0;
+unsigned long filter_hits = 0;
+
+// Example function to run on each chunk
+
+void run_fragment(SHA256_HASH_CTX * ctx)
+{
+ uint64_t lookup, set_hash;
+ unsigned int lookup_hash;
+ uint32_t idx;
+
+ chunks_created++;
+
+ // Run a simple lookup filter on chunk using digest
+ lookup_hash = ctx->job.result_digest[0] & FILTER_MASK;
+ lookup = filter_table[lookup_hash];
+
+ idx = ctx->job.result_digest[1];
+
+ set_hash = 1 << (idx & MASK_TO_INDEX_LONG) |
+ 1 << ((idx >> BITS_TO_INDEX_LONG) & MASK_TO_INDEX_LONG) |
+ 1 << ((idx >> (2 * BITS_TO_INDEX_LONG)) & MASK_TO_INDEX_LONG);
+
+ if ((lookup & set_hash) == set_hash)
+ filter_hits++;
+ else
+ filter_table[lookup_hash] = lookup | set_hash;
+}
+
+void setup_chunk_processing(void)
+{
+ int i;
+
+ sha256_ctx_mgr_init(&mb_hash_mgr);
+
+ for (i = 0; i < HASH_POOL_SIZE; i++)
+ hash_ctx_init(&ctxpool[i]);
+
+ last_ctx = &ctxpool[0];
+}
+
+SHA256_HASH_CTX *get_next_job_ctx(void)
+{
+ int i;
+ SHA256_HASH_CTX *ctx;
+
+ if (last_ctx && hash_ctx_complete(last_ctx))
+ return last_ctx;
+
+ for (i = 0; i < HASH_POOL_SIZE; i++) {
+ if (hash_ctx_complete(&ctxpool[i]))
+ return &ctxpool[i];
+ }
+ ctx = sha256_ctx_mgr_flush(&mb_hash_mgr);
+ assert(ctx != NULL);
+ return ctx;
+}
+
+void put_next_job_ctx(SHA256_HASH_CTX * ctx)
+{
+ if (ctx && hash_ctx_complete(ctx))
+ last_ctx = ctx;
+
+ run_fragment(ctx);
+}
+
+void process_chunk(uint8_t * buff, int len)
+{
+ SHA256_HASH_CTX *ctx;
+
+ ctx = get_next_job_ctx();
+ ctx = sha256_ctx_mgr_submit(&mb_hash_mgr, ctx, buff, len, HASH_ENTIRE);
+
+ if (ctx)
+ put_next_job_ctx(ctx);
+}
+
+void finish_chunk_processing(void)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while ((ctx = sha256_ctx_mgr_flush(&mb_hash_mgr)) != NULL)
+ run_fragment(ctx);
+}
+
+int main(void)
+{
+ int i, w;
+ uint8_t *buffer, *p;
+ uint32_t mask, trigger, offset = 0;
+ uint32_t min_chunk, max_chunk, mean_chunk;
+ long remain;
+ struct rh_state2 state;
+ struct perf start, stop;
+
+ // Chunking parameters
+ w = 32;
+ min_chunk = 1024;
+ mean_chunk = 4 * 1024;
+ max_chunk = 32 * 1024;
+ mask = rolling_hashx_mask_gen(mean_chunk, 0);
+ trigger = rand() & mask;
+
+ printf("chunk and hash test w=%d, min=%d, target_ave=%d, max=%d:\n", w, min_chunk,
+ mean_chunk, max_chunk);
+
+ if (min_chunk < w || min_chunk > max_chunk) {
+ printf(" Improper parameters selected\n");
+ return -1;
+ }
+
+ if ((buffer = malloc(MAX_BUFFER_SIZE)) == NULL) {
+ printf("cannot allocate mem\n");
+ return -1;
+ }
+ // Initialize buffer with random data
+ srand(TEST_SEED);
+ for (i = 0; i < MAX_BUFFER_SIZE; i++)
+ buffer[i] = rand();
+
+ // Start chunking test with multi-buffer hashing of results
+ perf_start(&start);
+
+ rolling_hash2_init(&state, w);
+ setup_chunk_processing();
+
+ p = buffer;
+ remain = MAX_BUFFER_SIZE;
+
+ while (remain > max_chunk) {
+ // Skip to min chunk
+ rolling_hash2_reset(&state, p + min_chunk - w);
+ rolling_hash2_run(&state, p + min_chunk, max_chunk - min_chunk,
+ mask, trigger, &offset);
+
+ process_chunk(p, min_chunk + offset);
+
+ p += offset + min_chunk;
+ remain -= (offset + min_chunk);
+ }
+
+ while (remain > min_chunk) {
+ rolling_hash2_reset(&state, p + min_chunk - w);
+ rolling_hash2_run(&state, p + min_chunk, remain - min_chunk,
+ mask, trigger, &offset);
+
+ process_chunk(p, min_chunk + offset);
+
+ p += offset + min_chunk;
+ remain -= (offset + min_chunk);
+ }
+
+ if (remain > 0)
+ process_chunk(p, remain);
+
+ finish_chunk_processing();
+ perf_stop(&stop);
+
+ printf("chunking_with_mb_hash: ");
+ perf_print(stop, start, MAX_BUFFER_SIZE);
+
+ printf(" found %ld chunks, ave_len=%ld, filter hits=%ld\n", chunks_created,
+ MAX_BUFFER_SIZE / chunks_created, filter_hits);
+
+ return 0;
+}
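
The mask and trigger chosen above set the average spacing of chunk boundaries: assuming roughly uniform hash values, each byte position past min_chunk matches the trigger with probability 2^-k for a k-bit mask, so the scan length follows a geometric distribution with mean about 2^k. The helper below is only a back-of-the-envelope estimate; it ignores the truncation at max_chunk and treats rolling_hashx_mask_gen() as a black box that returns such a mask.

#include <stdio.h>

/* rough expected chunk length for a k-bit mask and a fixed minimum chunk size */
static double expected_chunk(unsigned k, double min_chunk)
{
	double mean_scan = (double)(1ULL << k);   /* mean of a geometric distribution with p = 2^-k */
	return min_chunk + mean_scan;             /* truncation at max_chunk is ignored */
}

int main(void)
{
	/* with a 12-bit mask and a 1 KiB minimum, chunks average roughly 5 KiB */
	printf("%.0f\n", expected_chunk(12, 1024));
	return 0;
}
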
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2.c
new file mode 100644
index 000000000..4b066e40f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2.c
@@ -0,0 +1,169 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <string.h>
+#include "rolling_hashx.h"
+#include "rolling_hash2_table.h"
+
+extern
+uint64_t rolling_hash2_run_until(uint32_t * idx, int max_idx, uint64_t * t1,
+ uint64_t * t2, uint8_t * b1, uint8_t * b2, uint64_t h,
+ uint64_t mask, uint64_t trigger);
+
+int rolling_hash2_init(struct rh_state2 *state, uint32_t w)
+{
+ uint32_t i;
+ uint64_t v;
+
+ if (w > FINGERPRINT_MAX_WINDOW)
+ return -1;
+
+ for (i = 0; i < 256; i++) {
+ v = rolling_hash2_table1[i];
+ state->table1[i] = v;
+ state->table2[i] = (v << w) | (v >> (64 - w));
+ }
+ state->w = w;
+ return 0;
+}
+
+void rolling_hash2_reset(struct rh_state2 *state, uint8_t * init_bytes)
+{
+ uint64_t hash;
+ uint32_t i, w;
+
+ hash = 0;
+ w = state->w;
+ for (i = 0; i < w; i++) {
+ hash = (hash << 1) | (hash >> (64 - 1));
+ hash ^= state->table1[init_bytes[i]];
+ }
+ state->hash = hash;
+ memcpy(state->history, init_bytes, w);
+}
+
+static
+uint64_t hash_fn(struct rh_state2 *state, uint64_t h, uint8_t new_char, uint8_t old_char)
+{
+ h = (h << 1) | (h >> (64 - 1));
+ h ^= state->table1[new_char] ^ state->table2[old_char];
+ return h;
+}
+
+uint64_t rolling_hash2_run_until_base(uint32_t * idx, int max_idx, uint64_t * t1,
+ uint64_t * t2, uint8_t * b1, uint8_t * b2, uint64_t h,
+ uint64_t mask, uint64_t trigger)
+{
+ int i = *idx;
+
+ if (trigger == 0) {
+ for (; i < max_idx; i++) {
+ h = (h << 1) | (h >> (64 - 1));
+ h ^= t1[b1[i]] ^ t2[b2[i]];
+ if ((h & mask) == 0) {
+ *idx = i;
+ return h;
+ }
+ }
+ } else {
+ for (; i < max_idx; i++) {
+ h = (h << 1) | (h >> (64 - 1));
+ h ^= t1[b1[i]] ^ t2[b2[i]];
+ if ((h & mask) == trigger) {
+ *idx = i;
+ return h;
+ }
+ }
+ }
+ *idx = i;
+ return h;
+}
+
+int
+rolling_hash2_run(struct rh_state2 *state, uint8_t * buffer, uint32_t buffer_length,
+ uint32_t mask, uint32_t trigger, uint32_t * offset)
+{
+
+ uint32_t i;
+ uint32_t w = state->w;
+ uint64_t hash = state->hash;
+
+ for (i = 0; i < w; i++) {
+ if (i == buffer_length) {
+ *offset = i;
+ // update history
+ memmove(state->history, state->history + i, w - i);
+ memcpy(state->history + w - i, buffer, i);
+ state->hash = hash;
+ return FINGERPRINT_RET_MAX;
+ }
+ hash = hash_fn(state, hash, buffer[i], state->history[i]);
+
+ if ((hash & mask) == trigger) {
+ // found hit
+ i++;
+ *offset = i;
+ memmove(state->history, state->history + i, w - i);
+ memcpy(state->history + w - i, buffer, i);
+ state->hash = hash;
+ return FINGERPRINT_RET_HIT;
+ }
+ }
+
+ hash = rolling_hash2_run_until(&i, buffer_length, state->table1, state->table2,
+ buffer, buffer - w, hash, mask, trigger);
+ if ((hash & mask) == trigger) {
+ // found hit
+ i++;
+ *offset = i;
+ memcpy(state->history, buffer + i - w, w);
+ state->hash = hash;
+ return FINGERPRINT_RET_HIT;
+ }
+ // no hit
+ *offset = i;
+ memcpy(state->history, buffer + i - w, w);
+ state->hash = hash;
+ return FINGERPRINT_RET_MAX;
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver rolling_hash2_init_slver_00000264;
+struct slver rolling_hash2_init_slver = { 0x0264, 0x00, 0x00 };
+
+struct slver rolling_hash2_reset_slver_00000265;
+struct slver rolling_hash2_reset_slver = { 0x0265, 0x00, 0x00 };
+
+struct slver rolling_hash2_run_slver_00000266;
+struct slver rolling_hash2_run_slver = { 0x0266, 0x00, 0x00 };
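
The reason rolling_hash2 can slide its window with one table lookup per byte is the pairing table2[x] = rotl64(table1[x], w) set up in rolling_hash2_init(): after w update steps the contribution a byte made through table1 has been rotated exactly w bits, so XOR-ing table2 of the departing byte cancels it. The self-contained check below demonstrates that property with a throwaway random table rather than the pi-based one shipped in rolling_hash2_table.h.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define W 32   /* window size in bytes */

static uint64_t rotl64(uint64_t x, unsigned r) { return (x << r) | (x >> (64 - r)); }

static uint64_t t1[256], t2[256];

/* hash one window from scratch, same recurrence as rolling_hash2_reset() */
static uint64_t hash_window(const uint8_t *p)
{
	uint64_t h = 0;
	for (int i = 0; i < W; i++)
		h = rotl64(h, 1) ^ t1[p[i]];
	return h;
}

int main(void)
{
	uint8_t buf[W + 1];
	for (int i = 0; i < 256; i++) {
		t1[i] = ((uint64_t)rand() << 32) ^ (uint64_t)rand();
		t2[i] = rotl64(t1[i], W);
	}
	for (int i = 0; i < W + 1; i++)
		buf[i] = (uint8_t)rand();

	uint64_t h = hash_window(buf);                            /* window [0, W) */
	uint64_t rolled = rotl64(h, 1) ^ t1[buf[W]] ^ t2[buf[0]]; /* slide by one byte */
	printf("%s\n", rolled == hash_window(buf + 1) ? "ok" : "mismatch");
	return 0;
}
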
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_base_aliases.c
new file mode 100644
index 000000000..58ee50a92
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_base_aliases.c
@@ -0,0 +1,39 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+uint64_t rolling_hash2_run_until_base(uint32_t * idx, int max_idx, uint64_t * t1,
+ uint64_t * t2, uint8_t * b1, uint8_t * b2, uint64_t h,
+ uint64_t mask, uint64_t trigger);
+uint64_t rolling_hash2_run_until(uint32_t * idx, int max_idx, uint64_t * t1,
+ uint64_t * t2, uint8_t * b1, uint8_t * b2, uint64_t h,
+ uint64_t mask, uint64_t trigger)
+{
+ return rolling_hash2_run_until_base(idx, max_idx, t1, t2, b1, b2, h, mask, trigger);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_multibinary.asm
new file mode 100644
index 000000000..ad62dad74
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_multibinary.asm
@@ -0,0 +1,122 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf32
+
+[bits 32]
+%define def_wrd dd
+%define wrd_sz dword
+%define arg1 esi
+
+%else
+
+default rel
+[bits 64]
+%define def_wrd dq
+%define wrd_sz qword
+%define arg1 rsi
+
+extern rolling_hash2_run_until_00
+extern rolling_hash2_run_until_04
+%endif
+
+extern rolling_hash2_run_until_base
+
+
+section .data
+;;; *_mbinit is the initial value of *_dispatched, which is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+rolling_hash2_run_until_dispatched:
+ def_wrd rolling_hash2_run_until_mbinit
+
+section .text
+
+;;;;
+; rolling_hash2_run_until multibinary function
+;;;;
+mk_global rolling_hash2_run_until, function
+rolling_hash2_run_until_mbinit:
+ endbranch
+ call rolling_hash2_run_until_dispatch_init
+
+rolling_hash2_run_until:
+ jmp wrd_sz [rolling_hash2_run_until_dispatched]
+
+rolling_hash2_run_until_dispatch_init:
+ push arg1
+%ifidn __OUTPUT_FORMAT__, elf32 ;; 32-bit check
+ lea arg1, [rolling_hash2_run_until_base]
+%else
+ push rax
+ push rbx
+ push rcx
+ push rdx
+ lea arg1, [rolling_hash2_run_until_base WRT_OPT] ; Default
+
+ mov eax, 1
+ cpuid
+ lea rbx, [rolling_hash2_run_until_00 WRT_OPT]
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ cmovne arg1, rbx
+
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea rbx, [rolling_hash2_run_until_00 WRT_OPT]
+
+ jne _done_rolling_hash2_run_until_data_init
+ mov rsi, rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID1_EBX_AVX2
+ lea rbx, [rolling_hash2_run_until_04 WRT_OPT]
+ cmovne rsi, rbx
+
+ ;; Check whether the OS has enabled xmm and ymm state (xgetbv)
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _done_rolling_hash2_run_until_data_init
+ lea rsi, [rolling_hash2_run_until_00 WRT_OPT]
+
+_done_rolling_hash2_run_until_data_init:
+ pop rdx
+ pop rcx
+ pop rbx
+ pop rax
+%endif ;; END 32-bit check
+ mov [rolling_hash2_run_until_dispatched], arg1
+ pop arg1
+ ret
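
The assembly above implements a lazy, self-patching dispatcher: rolling_hash2_run_until initially jumps through a pointer that targets an init stub, which probes CPUID/XGETBV once, stores the best implementation back into the pointer, and then jumps to it. A C-level sketch of the same pattern follows; cpu_has_avx2() and the *_impl functions are illustrative stand-ins, not the real probes or kernels.

#include <stdio.h>

typedef long (*hash_fn)(const unsigned char *buf, long len);

/* stand-ins for the real scalar and vectorized kernels */
static long base_impl(const unsigned char *buf, long len) { (void)buf; return len; }
static long avx2_impl(const unsigned char *buf, long len) { (void)buf; return len; }

static int cpu_has_avx2(void) { return 0; }   /* stand-in for the CPUID/XGETBV checks */

static long dispatch_init(const unsigned char *buf, long len);
static hash_fn dispatched = dispatch_init;    /* patched on the first call */

static long dispatch_init(const unsigned char *buf, long len)
{
	dispatched = cpu_has_avx2() ? avx2_impl : base_impl;
	return dispatched(buf, len);
}

/* public entry point: every call after the first goes straight to the chosen kernel */
long hash_run(const unsigned char *buf, long len)
{
	return dispatched(buf, len);
}

int main(void)
{
	unsigned char b[4] = { 0 };
	printf("%ld\n", hash_run(b, 4));
	return 0;
}
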
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_perf.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_perf.c
new file mode 100644
index 000000000..da0e0fba7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_perf.c
@@ -0,0 +1,120 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "rolling_hashx.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 100000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef FUT_run
+# define FUT_run rolling_hash2_run
+#endif
+#ifndef FUT_init
+# define FUT_init rolling_hash2_init
+#endif
+#ifndef FUT_reset
+# define FUT_reset rolling_hash2_reset
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x123f
+#endif
+
+#define TEST_MEM TEST_LEN
+
+int main(int argc, char *argv[])
+{
+ uint8_t *buf;
+ uint32_t mask, trigger, offset = 0;
+ int i, w, ret;
+ long long run_length;
+ struct rh_state2 *state;
+ struct perf start, stop;
+
+ // Test case parameters
+ w = 32;
+ mask = 0xffffffff;
+ trigger = 0x123;
+
+ printf(xstr(FUT_run) "_perf:\n");
+
+ buf = malloc(TEST_LEN);
+ if (buf == NULL) {
+ printf("alloc error: Fail\n");
+ return -1;
+ }
+ if (posix_memalign((void **)&state, 64, sizeof(struct rh_state2))) {
+ printf("alloc error rh_state: Fail\n");;
+ return -1;
+ }
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_LEN; i++)
+ buf[i] = rand();
+
+ printf("Start timed tests\n");
+ fflush(0);
+
+ FUT_init(state, w);
+ FUT_reset(state, buf);
+ ret = FUT_run(state, buf, TEST_LEN, mask, trigger, &offset);
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ ret = FUT_run(state, buf, TEST_LEN, mask, trigger, &offset);
+ }
+ perf_stop(&stop);
+
+ run_length = (ret == FINGERPRINT_RET_HIT) ? offset : TEST_LEN;
+ printf(" returned %d after %lld B\n", ret, run_length);
+ printf(xstr(FUT_run) TEST_TYPE_STR ": ");
+ perf_print(stop, start, run_length * i);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_table.h b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_table.h
new file mode 100644
index 000000000..366f26374
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_table.h
@@ -0,0 +1,296 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _ROLLING_HASH2_TABLE_H_
+#define _ROLLING_HASH2_TABLE_H_
+
+// values are the fractional part of pi
+// taken from:
+// http://www.herongyang.com/Cryptography/Blowfish-First-8366-Hex-Digits-of-PI.html
+// and from the source code of BlowfishJ
+
+uint64_t rolling_hash2_table1[256] = {
+ 0x243F6A8885A308D3,
+ 0x13198A2E03707344,
+ 0xA4093822299F31D0,
+ 0x082EFA98EC4E6C89,
+ 0x452821E638D01377,
+ 0xBE5466CF34E90C6C,
+ 0xC0AC29B7C97C50DD,
+ 0x3F84D5B5B5470917,
+ 0x9216D5D98979FB1B,
+ 0xD1310BA698DFB5AC,
+ 0x2FFD72DBD01ADFB7,
+ 0xB8E1AFED6A267E96,
+ 0xBA7C9045F12C7F99,
+ 0x24A19947B3916CF7,
+ 0x0801F2E2858EFC16,
+ 0x636920D871574E69,
+ 0xA458FEA3F4933D7E,
+ 0x0D95748F728EB658,
+ 0x718BCD5882154AEE,
+ 0x7B54A41DC25A59B5,
+ 0x9C30D5392AF26013,
+ 0xC5D1B023286085F0,
+ 0xCA417918B8DB38EF,
+ 0x8E79DCB0603A180E,
+ 0x6C9E0E8BB01E8A3E,
+ 0xD71577C1BD314B27,
+ 0x78AF2FDA55605C60,
+ 0xE65525F3AA55AB94,
+ 0x5748986263E81440,
+ 0x55CA396A2AAB10B6,
+ 0xB4CC5C341141E8CE,
+ 0xA15486AF7C72E993,
+ 0xB3EE1411636FBC2A,
+ 0x2BA9C55D741831F6,
+ 0xCE5C3E169B87931E,
+ 0xAFD6BA336C24CF5C,
+ 0x7A32538128958677,
+ 0x3B8F48986B4BB9AF,
+ 0xC4BFE81B66282193,
+ 0x61D809CCFB21A991,
+ 0x487CAC605DEC8032,
+ 0xEF845D5DE98575B1,
+ 0xDC262302EB651B88,
+ 0x23893E81D396ACC5,
+ 0x0F6D6FF383F44239,
+ 0x2E0B4482A4842004,
+ 0x69C8F04A9E1F9B5E,
+ 0x21C66842F6E96C9A,
+ 0x670C9C61ABD388F0,
+ 0x6A51A0D2D8542F68,
+ 0x960FA728AB5133A3,
+ 0x6EEF0B6C137A3BE4,
+ 0xBA3BF0507EFB2A98,
+ 0xA1F1651D39AF0176,
+ 0x66CA593E82430E88,
+ 0x8CEE8619456F9FB4,
+ 0x7D84A5C33B8B5EBE,
+ 0xE06F75D885C12073,
+ 0x401A449F56C16AA6,
+ 0x4ED3AA62363F7706,
+ 0x1BFEDF72429B023D,
+ 0x37D0D724D00A1248,
+ 0xDB0FEAD349F1C09B,
+ 0x075372C980991B7B,
+ 0x25D479D8F6E8DEF7,
+ 0xE3FE501AB6794C3B,
+ 0x976CE0BD04C006BA,
+ 0xC1A94FB6409F60C4,
+ 0x5E5C9EC2196A2463,
+ 0x68FB6FAF3E6C53B5,
+ 0x1339B2EB3B52EC6F,
+ 0x6DFC511F9B30952C,
+ 0xCC814544AF5EBD09,
+ 0xBEE3D004DE334AFD,
+ 0x660F2807192E4BB3,
+ 0xC0CBA85745C8740F,
+ 0xD20B5F39B9D3FBDB,
+ 0x5579C0BD1A60320A,
+ 0xD6A100C6402C7279,
+ 0x679F25FEFB1FA3CC,
+ 0x8EA5E9F8DB3222F8,
+ 0x3C7516DFFD616B15,
+ 0x2F501EC8AD0552AB,
+ 0x323DB5FAFD238760,
+ 0x53317B483E00DF82,
+ 0x9E5C57BBCA6F8CA0,
+ 0x1A87562EDF1769DB,
+ 0xD542A8F6287EFFC3,
+ 0xAC6732C68C4F5573,
+ 0x695B27B0BBCA58C8,
+ 0xE1FFA35DB8F011A0,
+ 0x10FA3D98FD2183B8,
+ 0x4AFCB56C2DD1D35B,
+ 0x9A53E479B6F84565,
+ 0xD28E49BC4BFB9790,
+ 0xE1DDF2DAA4CB7E33,
+ 0x62FB1341CEE4C6E8,
+ 0xEF20CADA36774C01,
+ 0xD07E9EFE2BF11FB4,
+ 0x95DBDA4DAE909198,
+ 0xEAAD8E716B93D5A0,
+ 0xD08ED1D0AFC725E0,
+ 0x8E3C5B2F8E7594B7,
+ 0x8FF6E2FBF2122B64,
+ 0x8888B812900DF01C,
+ 0x4FAD5EA0688FC31C,
+ 0xD1CFF191B3A8C1AD,
+ 0x2F2F2218BE0E1777,
+ 0xEA752DFE8B021FA1,
+ 0xE5A0CC0FB56F74E8,
+ 0x18ACF3D6CE89E299,
+ 0xB4A84FE0FD13E0B7,
+ 0x7CC43B81D2ADA8D9,
+ 0x165FA26680957705,
+ 0x93CC7314211A1477,
+ 0xE6AD206577B5FA86,
+ 0xC75442F5FB9D35CF,
+ 0xEBCDAF0C7B3E89A0,
+ 0xD6411BD3AE1E7E49,
+ 0x00250E2D2071B35E,
+ 0x226800BB57B8E0AF,
+ 0x2464369BF009B91E,
+ 0x5563911D59DFA6AA,
+ 0x78C14389D95A537F,
+ 0x207D5BA202E5B9C5,
+ 0x832603766295CFA9,
+ 0x11C819684E734A41,
+ 0xB3472DCA7B14A94A,
+ 0x1B5100529A532915,
+ 0xD60F573FBC9BC6E4,
+ 0x2B60A47681E67400,
+ 0x08BA6FB5571BE91F,
+ 0xF296EC6B2A0DD915,
+ 0xB6636521E7B9F9B6,
+ 0xFF34052EC5855664,
+ 0x53B02D5DA99F8FA1,
+ 0x08BA47996E85076A,
+ 0x4B7A70E9B5B32944,
+ 0xDB75092EC4192623,
+ 0xAD6EA6B049A7DF7D,
+ 0x9CEE60B88FEDB266,
+ 0xECAA8C71699A17FF,
+ 0x5664526CC2B19EE1,
+ 0x193602A575094C29,
+ 0xA0591340E4183A3E,
+ 0x3F54989A5B429D65,
+ 0x6B8FE4D699F73FD6,
+ 0xA1D29C07EFE830F5,
+ 0x4D2D38E6F0255DC1,
+ 0x4CDD20868470EB26,
+ 0x6382E9C6021ECC5E,
+ 0x09686B3F3EBAEFC9,
+ 0x3C9718146B6A70A1,
+ 0x687F358452A0E286,
+ 0xB79C5305AA500737,
+ 0x3E07841C7FDEAE5C,
+ 0x8E7D44EC5716F2B8,
+ 0xB03ADA37F0500C0D,
+ 0xF01C1F040200B3FF,
+ 0xAE0CF51A3CB574B2,
+ 0x25837A58DC0921BD,
+ 0xD19113F97CA92FF6,
+ 0x9432477322F54701,
+ 0x3AE5E58137C2DADC,
+ 0xC8B576349AF3DDA7,
+ 0xA94461460FD0030E,
+ 0xECC8C73EA4751E41,
+ 0xE238CD993BEA0E2F,
+ 0x3280BBA1183EB331,
+ 0x4E548B384F6DB908,
+ 0x6F420D03F60A04BF,
+ 0x2CB8129024977C79,
+ 0x5679B072BCAF89AF,
+ 0xDE9A771FD9930810,
+ 0xB38BAE12DCCF3F2E,
+ 0x5512721F2E6B7124,
+ 0x501ADDE69F84CD87,
+ 0x7A5847187408DA17,
+ 0xBC9F9ABCE94B7D8C,
+ 0xEC7AEC3ADB851DFA,
+ 0x63094366C464C3D2,
+ 0xEF1C18473215D908,
+ 0xDD433B3724C2BA16,
+ 0x12A14D432A65C451,
+ 0x50940002133AE4DD,
+ 0x71DFF89E10314E55,
+ 0x81AC77D65F11199B,
+ 0x043556F1D7A3C76B,
+ 0x3C11183B5924A509,
+ 0xF28FE6ED97F1FBFA,
+ 0x9EBABF2C1E153C6E,
+ 0x86E34570EAE96FB1,
+ 0x860E5E0A5A3E2AB3,
+ 0x771FE71C4E3D06FA,
+ 0x2965DCB999E71D0F,
+ 0x803E89D65266C825,
+ 0x2E4CC9789C10B36A,
+ 0xC6150EBA94E2EA78,
+ 0xA5FC3C531E0A2DF4,
+ 0xF2F74EA7361D2B3D,
+ 0x1939260F19C27960,
+ 0x5223A708F71312B6,
+ 0xEBADFE6EEAC31F66,
+ 0xE3BC4595A67BC883,
+ 0xB17F37D1018CFF28,
+ 0xC332DDEFBE6C5AA5,
+ 0x6558218568AB9802,
+ 0xEECEA50FDB2F953B,
+ 0x2AEF7DAD5B6E2F84,
+ 0x1521B62829076170,
+ 0xECDD4775619F1510,
+ 0x13CCA830EB61BD96,
+ 0x0334FE1EAA0363CF,
+ 0xB5735C904C70A239,
+ 0xD59E9E0BCBAADE14,
+ 0xEECC86BC60622CA7,
+ 0x9CAB5CABB2F3846E,
+ 0x648B1EAF19BDF0CA,
+ 0xA02369B9655ABB50,
+ 0x40685A323C2AB4B3,
+ 0x319EE9D5C021B8F7,
+ 0x9B540B19875FA099,
+ 0x95F7997E623D7DA8,
+ 0xF837889A97E32D77,
+ 0x11ED935F16681281,
+ 0x0E358829C7E61FD6,
+ 0x96DEDFA17858BA99,
+ 0x57F584A51B227263,
+ 0x9B83C3FF1AC24696,
+ 0xCDB30AEB532E3054,
+ 0x8FD948E46DBC3128,
+ 0x58EBF2EF34C6FFEA,
+ 0xFE28ED61EE7C3C73,
+ 0x5D4A14D9E864B7E3,
+ 0x42105D14203E13E0,
+ 0x45EEE2B6A3AAABEA,
+ 0xDB6C4F15FACB4FD0,
+ 0xC742F442EF6ABBB5,
+ 0x654F3B1D41CD2105,
+ 0xD81E799E86854DC7,
+ 0xE44B476A3D816250,
+ 0xCF62A1F25B8D2646,
+ 0xFC8883A0C1C7B6A3,
+ 0x7F1524C369CB7492,
+ 0x47848A0B5692B285,
+ 0x095BBF00AD19489D,
+ 0x1462B17423820E00,
+ 0x58428D2A0C55F5EA,
+ 0x1DADF43E233F7061,
+ 0x3372F0928D937E41,
+ 0xD65FECF16C223BDB,
+ 0x7CDE3759CBEE7460,
+ 0x4085F2A7CE77326E,
+ 0xA607808419F8509E,
+ 0xE8EFD85561D99735,
+ 0xA969A7AAC50C06C2,
+};
+#endif // _ROLLING_HASH2_TABLE_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_test.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_test.c
new file mode 100644
index 000000000..ee45c120d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_test.c
@@ -0,0 +1,314 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdio.h>
+#include "rolling_hashx.h"
+
+#ifndef FUT_run
+# define FUT_run rolling_hash2_run
+#endif
+#ifndef FUT_init
+# define FUT_init rolling_hash2_init
+#endif
+#ifndef FUT_reset
+# define FUT_reset rolling_hash2_reset
+#endif
+#ifndef FUT_ref
+# define FUT_ref rolling_hash2_ref
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define MAX_BUFFER_SIZE 128*1024*1024
+#define MAX_ROLLING_HASH_WIDTH 32
+
+#ifndef RANDOMS
+# define RANDOMS 200
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static
+uint64_t rolling_hash2_ref(struct rh_state2 *state, unsigned char *p, int len,
+ uint64_t hash_init)
+{
+ int i;
+ uint64_t h = hash_init;
+
+ for (i = 0; i < len; i++) {
+ h = (h << 1) | (h >> (64 - 1));
+ h ^= state->table1[*p++];
+ }
+ return h;
+}
+
+int ones_in_mask(uint32_t in)
+{
+ int count;
+
+ for (count = 0; in != 0; in &= (in - 1))
+ count++;
+
+ return count;
+}
+
+/*
+ * Utility function to pick a random mask. Not uniform in number of bits.
+ */
+uint32_t pick_rand_mask_in_range(int min_bits, int max_bits)
+{
+ uint32_t mask = 0;
+ int ones;
+
+ do {
+ mask = rand();
+#if defined(_WIN32) || defined(_WIN64)
+ mask = (mask << 16) ^ rand();
+#endif
+ ones = ones_in_mask(mask);
+ } while (ones < min_bits || ones > max_bits);
+
+ return mask;
+}
+
+int main(void)
+{
+ uint8_t *buffer;
+ uint64_t hash;
+ uint32_t mask, trigger, offset = 0;
+ int i, w, r, ret, max, errors = 0;
+ uint32_t offset_fut;
+ struct rh_state2 state;
+
+ printf(xstr(FUT_run) ": " xstr(MAX_BUFFER_SIZE));
+
+ buffer = malloc(MAX_BUFFER_SIZE);
+ if (buffer == NULL) {
+ printf("cannot allocate mem\n");
+ return -1;
+ }
+ srand(TEST_SEED);
+
+ // Test case 1, compare trigger case at boundary with reference hash
+ w = 32;
+ mask = 0xffff0;
+ trigger = 0x3df0;
+ trigger &= mask;
+
+ for (i = 0; i < MAX_BUFFER_SIZE; i++)
+ buffer[i] = rand();
+
+ FUT_init(&state, w);
+ FUT_reset(&state, buffer);
+
+ uint8_t *p = buffer;
+ int remain = MAX_BUFFER_SIZE;
+ ret = FINGERPRINT_RET_HIT;
+
+ while ((ret == FINGERPRINT_RET_HIT) && (remain > 0)) {
+ ret = FUT_run(&state, p, remain, mask, trigger, &offset);
+
+ if (offset > remain) {
+ printf(" error offset past remaining limit\n");
+ errors++;
+ }
+
+ if ((ret == FINGERPRINT_RET_HIT) && (&p[offset] > &buffer[w])) {
+ hash = FUT_ref(&state, &p[offset] - w, w, 0);
+ if ((hash & mask) != trigger) {
+ printf(" mismatch chunk from ref");
+ printf(" hit: offset=%d %lx %lx\n", offset, state.hash, hash);
+ errors++;
+ }
+ }
+ p += offset;
+ remain -= offset;
+ putchar('.');
+ }
+
+ putchar('.'); // Finished test 1
+
+ // Test case 2, check if reference function hits same chunk boundary as test
+
+ w = 32;
+ mask = 0xffff;
+ trigger = rand();
+ trigger &= mask;
+ p = buffer;
+
+ // Function under test
+ FUT_init(&state, w);
+ FUT_reset(&state, p);
+ ret = FUT_run(&state, p + w, MAX_BUFFER_SIZE - w, mask, trigger, &offset_fut);
+ offset_fut += w;
+
+ // Reference
+ for (p++, offset = w + 1; offset < MAX_BUFFER_SIZE; offset++) {
+ hash = FUT_ref(&state, p++, w, 0);
+ if ((hash & mask) == trigger)
+ break;
+ }
+
+ if (offset != offset_fut) {
+ printf("\ncase 2, offset of chunk different from ref\n");
+ printf(" case 2: stop fut at offset=%d\n", offset_fut);
+ printf(" case 2: stop ref at offset=%d\n", offset);
+ errors++;
+ return errors;
+ }
+ putchar('.'); // Finished test 2
+
+ // Do case 2 above with random args
+
+ for (r = 0; r < RANDOMS; r++) {
+ w = rand() % MAX_ROLLING_HASH_WIDTH;
+ if (w < 3)
+ continue;
+
+ mask = pick_rand_mask_in_range(4, 20);
+ trigger = rand() & mask;
+ p = buffer;
+
+ // Function under test
+ FUT_init(&state, w);
+ FUT_reset(&state, p);
+ ret = FUT_run(&state, p + w, MAX_BUFFER_SIZE - w, mask, trigger, &offset_fut);
+ offset_fut += w;
+
+ // Reference
+ for (p++, offset = w + 1; offset < MAX_BUFFER_SIZE; offset++) {
+ hash = FUT_ref(&state, p++, w, 0);
+ if ((hash & mask) == trigger)
+ break;
+ }
+
+ if (offset != offset_fut) {
+ printf("\nrand case 2 #%d: w=%d, mask=0x%x, trigger=0x%x\n", r, w,
+ mask, trigger);
+ printf(" offset of chunk different from ref\n");
+ printf(" case 2r: stop fut at offset=%d\n", offset_fut);
+ printf(" case 2r: stop ref at offset=%d\n", offset);
+ errors++;
+ return errors;
+ }
+ putchar('.');
+ }
+
+ // Test case 3, check if max bound is same
+
+ w = 32;
+ mask = 0xfffff;
+ trigger = rand();
+ trigger &= mask;
+ putchar('|');
+
+ for (max = w + 1; max < 500; max++) {
+ p = buffer;
+ FUT_init(&state, w);
+ FUT_reset(&state, p);
+
+ ret = FUT_run(&state, p + w, max - w, mask, trigger, &offset_fut);
+ offset_fut += w;
+
+ int ret_ref = FINGERPRINT_RET_MAX;
+ for (p++, offset = w + 1; offset < max; offset++) {
+ hash = FUT_ref(&state, p++, w, 0);
+ if ((hash & mask) == trigger) {
+ ret_ref = FINGERPRINT_RET_HIT;
+ break;
+ }
+ }
+
+ if (offset != offset_fut || ret != ret_ref) {
+ printf("\ncase 3 max=%d, offset of chunk different from ref\n", max);
+ printf(" case 3: stop fut at offset=%d\n", offset_fut);
+ printf(" case 3: stop ref at offset=%d\n", offset);
+ printf(" case 3: ret_fut=%d ret_ref=%d\n", ret, ret_ref);
+ errors++;
+ return errors;
+ }
+ putchar('.'); // Finished test 3
+ }
+
+ // Test case 4, check if max bound is same under random params
+
+ for (r = 0; r < RANDOMS; r++) {
+ p = buffer;
+ mask = pick_rand_mask_in_range(24, 30); // Pick an unlikely mask
+ trigger = rand() & mask;
+ w = rand() % MAX_ROLLING_HASH_WIDTH;
+ max = rand() % 1024;
+
+ if (w < 3 || max < 2 * MAX_ROLLING_HASH_WIDTH)
+ continue;
+
+ FUT_init(&state, w);
+ FUT_reset(&state, p);
+
+ ret = FUT_run(&state, p, max, mask, trigger, &offset_fut);
+
+ if (offset_fut <= w)
+ continue;
+
+ int ret_ref = FINGERPRINT_RET_MAX;
+ for (p++, offset = w + 1; offset < max; offset++) {
+ hash = FUT_ref(&state, p++, w, 0);
+ if ((hash & mask) == trigger) {
+ ret_ref = FINGERPRINT_RET_HIT;
+ break;
+ }
+ }
+
+ if (offset != offset_fut || ret != ret_ref) {
+ printf("\ncase 4 rand case different from ref, max=%d w=%d\n", max, w);
+ printf(" case 4: stop fut at offset=%d\n", offset_fut);
+ printf(" case 4: stop ref at offset=%d\n", offset);
+ printf(" case 4: ret_fut=%d ret_ref=%d\n", ret, ret_ref);
+ errors++;
+ return errors;
+ }
+ putchar('.'); // Finished test 4
+
+ if (ret == FINGERPRINT_RET_HIT) {
+ p[-1] = rand(); // Keep hits from repeating
+ }
+ }
+
+ if (errors > 0)
+ printf(" Fail: %d\n", errors);
+ else
+ printf(" Pass\n");
+ return errors;
+}
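The reference function and the mask/trigger checks above are the whole algorithm in miniature: keep a 64-bit hash of the last w bytes using a rotate-and-xor recurrence, and declare a chunk boundary wherever the masked hash equals the trigger. A minimal standalone sketch of that idea, recomputing each window from scratch the way rolling_hash2_ref() does (names here are illustrative and not part of the ISA-L API):

#include <stdint.h>
#include <stddef.h>

/* Hash one window of w bytes: rotate left by 1, then xor a per-byte table
 * value, exactly as rolling_hash2_ref() does above. */
static uint64_t toy_hash_window(const uint64_t table[256], const uint8_t *p,
                                int w, uint64_t h)
{
	for (int i = 0; i < w; i++) {
		h = (h << 1) | (h >> 63);
		h ^= table[p[i]];
	}
	return h;
}

/* Scan for the first position whose window hash matches trigger under mask.
 * This is O(len * w); the library's run functions update the hash
 * incrementally instead of rehashing every window. */
static size_t toy_find_boundary(const uint64_t table[256], const uint8_t *buf,
                                size_t len, int w, uint64_t mask, uint64_t trigger)
{
	for (size_t end = w; end <= len; end++) {
		uint64_t h = toy_hash_window(table, buf + end - w, w, 0);
		if ((h & mask) == trigger)
			return end;	/* boundary after 'end' bytes */
	}
	return len;	/* no hit; treat the end of the buffer as the boundary */
}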
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_00.asm b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_00.asm
new file mode 100644
index 000000000..99091faa4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_00.asm
@@ -0,0 +1,204 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; uint64_t rolling_hash2_run_until_00(uint32_t *idx, uint32_t buffer_length, uint64_t *t1,
+;;; uint64_t *t2, uint8_t *b1, uint8_t *b2, uint64_t h, uint64_t mask,
+;;; uint64_t trigger)
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+
+ %define arg6 r10
+ %define arg7 r11
+ %define arg8 r12 ; must be saved and loaded
+ %define tmp1 rbp ; must be saved and loaded
+ %define tmp2 rbx ; must be saved and loaded
+ %define tmp3 r13 ; must be saved and loaded
+ %define tmp4 r14 ; must be saved and loaded
+ %define tmp5 r15 ; must be saved and loaded
+ %define return rax
+ %define PS 8
+ %define frame_size 6*8
+ %define arg(x) [rsp + frame_size + PS + PS*x]
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push rbp
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ mov arg6, arg(0)
+ mov arg7, arg(1)
+ mov arg8, arg(2)
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ pop rbp
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12 ; must be saved and loaded
+ %define arg5 r13 ; must be saved and loaded
+ %define arg6 r14 ; must be saved and loaded
+ %define arg7 r15 ; must be saved and loaded
+ %define arg8 rbx ; must be saved and loaded
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 rdi ; must be saved and loaded
+ %define tmp4 rsi ; must be saved and loaded
+ %define tmp5 rbp ; must be saved and loaded
+ %define return rax
+ %define PS 8
+ %define frame_size 8*8
+ %define arg(x) [rsp + frame_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ push_reg r12
+ push_reg r13
+ push_reg r14
+ push_reg r15
+ push_reg rbx
+ push_reg rdi
+ push_reg rsi
+ push_reg rbp
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+ mov arg6, arg(6)
+ mov arg7, arg(7)
+ mov arg8, arg(8)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rsi
+ pop rdi
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%endif
+
+%define idx arg0
+%define max arg1
+%define t1 arg2
+%define t2 arg3
+%define b1 arg4
+%define b2 arg5
+%define hash arg6
+%define mask arg7
+%define trigger arg8
+
+%define pos rax
+%define pos.w eax
+%define x tmp2
+%define y tmp3
+%define z tmp4
+%define h tmp1
+%define a tmp5
+
+default rel
+[bits 64]
+section .text
+
+align 16
+mk_global rolling_hash2_run_until_00, function
+func(rolling_hash2_run_until_00)
+ endbranch
+ FUNC_SAVE
+ mov pos.w, dword [idx]
+ sub max, 2
+ cmp pos, max
+ jg .less_than_2
+
+.loop2: ror hash, 0x3f
+ movzx x, byte [b1 + pos]
+ movzx a, byte [b1 + pos + 1]
+ movzx y, byte [b2 + pos]
+ movzx h, byte [b2 + pos + 1]
+ mov z, [t1 + x * 8]
+ xor z, [t2 + y * 8]
+ xor hash, z
+ mov x, hash
+ and x, mask
+ cmp x, trigger
+ je .ret_0
+
+ ror hash, 0x3f
+ mov z, [t1 + a * 8]
+ xor z, [t2 + h * 8]
+ xor hash, z
+ mov y, hash
+ and y, mask
+ cmp y, trigger
+ je .ret_1
+
+ add pos, 2
+ cmp pos, max
+ jle .loop2
+
+.less_than_2:
+ add max, 1
+ cmp pos, max
+ jg .ret_0
+ ror hash, 0x3f
+ movzx x, byte [b1 + pos]
+ movzx y, byte [b2 + pos]
+ mov z, [t1 + x * 8]
+ xor z, [t2 + y * 8]
+ xor hash, z
+.ret_1: add pos, 1
+.ret_0: mov dword [idx], pos.w
+ mov rax, hash
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
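The kernel above is the constant-work-per-byte form of the same recurrence. Each iteration rotates the hash left by one (ror by 63), xors in a table entry for the byte read through b1 and another for the byte read through b2; the caller presumably passes b1 and b2 offset by the window width, with t2 holding table1 entries pre-rotated by that width so the departing byte's contribution cancels. A plain-C rendering of the loop, with that assumption stated explicitly (the assembly unrolls two bytes per iteration and handles the final odd byte separately; this sketch does not):

#include <stdint.h>

static inline uint64_t rotl64(uint64_t x, int r)
{
	return (x << r) | (x >> (64 - r));
}

/* Simplified model of rolling_hash2_run_until_00().
 * Assumption (not taken from this file): t2[c] == rotl64(t1[c], w) for the
 * window width w, so xoring t2[b2[pos]] removes the byte that left the window. */
static uint64_t toy_run_until(uint32_t *idx, uint32_t max, const uint64_t *t1,
                              const uint64_t *t2, const uint8_t *b1,
                              const uint8_t *b2, uint64_t hash,
                              uint64_t mask, uint64_t trigger)
{
	uint32_t pos = *idx;

	while (pos < max) {
		hash = rotl64(hash, 1);
		hash ^= t1[b1[pos]] ^ t2[b2[pos]];
		if ((hash & mask) == trigger)
			break;	/* leave idx at the hit position, as .ret_0/.ret_1 do */
		pos++;
	}
	*idx = pos;
	return hash;
}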
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_04.asm b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_04.asm
new file mode 100644
index 000000000..3f4e8353b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hash2_until_04.asm
@@ -0,0 +1,203 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; uint64_t rolling_hash2_run_until_04(uint32_t *idx, uint32_t max_idx, uint64_t *t1,
+;;; uint64_t *t2, uint8_t *b1, uint8_t *b2, uint64_t h, uint64_t mask,
+;;; uint64_t trigger)
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+ %define arg4 r8
+ %define arg5 r9
+
+ %define arg6 r10
+ %define arg7 r11
+ %define arg8 r12 ; must be saved and loaded
+ %define tmp1 rbp ; must be saved and loaded
+ %define tmp2 rbx ; must be saved and loaded
+ %define tmp3 r13 ; must be saved and loaded
+ %define tmp4 r14 ; must be saved and loaded
+ %define tmp5 r15 ; must be saved and loaded
+ %define return rax
+ %define PS 8
+ %define frame_size 6*8
+ %define arg(x) [rsp + frame_size + PS + PS*x]
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push rbp
+ push rbx
+ push r12
+ push r13
+ push r14
+ push r15
+ mov arg6, arg(0)
+ mov arg7, arg(1)
+ mov arg8, arg(2)
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ pop rbx
+ pop rbp
+ %endmacro
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+ %define arg4 r12 ; must be saved and loaded
+ %define arg5 r13 ; must be saved and loaded
+ %define arg6 r14 ; must be saved and loaded
+ %define arg7 r15 ; must be saved and loaded
+ %define arg8 rbx ; must be saved and loaded
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 rdi ; must be saved and loaded
+ %define tmp4 rsi ; must be saved and loaded
+ %define tmp5 rbp ; must be saved and loaded
+ %define return rax
+ %define PS 8
+ %define frame_size 8*8
+ %define arg(x) [rsp + frame_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ push_reg r12
+ push_reg r13
+ push_reg r14
+ push_reg r15
+ push_reg rbx
+ push_reg rdi
+ push_reg rsi
+ push_reg rbp
+ end_prolog
+ mov arg4, arg(4)
+ mov arg5, arg(5)
+ mov arg6, arg(6)
+ mov arg7, arg(7)
+ mov arg8, arg(8)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rsi
+ pop rdi
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%endif
+
+%define idx arg0
+%define max arg1
+%define t1 arg2
+%define t2 arg3
+%define b1 arg4
+%define b2 arg5
+%define hash arg6
+%define mask arg7
+%define trigger arg8
+
+%define pos rax
+%define pos.w eax
+%define x tmp2
+%define y tmp3
+%define z tmp4
+%define h tmp1
+%define a tmp5
+
+default rel
+[bits 64]
+section .text
+
+align 16
+mk_global rolling_hash2_run_until_04, function
+func(rolling_hash2_run_until_04)
+ endbranch
+ FUNC_SAVE
+ mov pos.w, dword [idx]
+ pext trigger, trigger, mask
+ sub max, 2
+ cmp pos, max
+ jg .less_than_2
+
+.loop2: rorx hash, hash, 0x3f
+ movzx x, byte [b1 + pos]
+ movzx a, byte [b1 + pos + 1]
+ movzx y, byte [b2 + pos]
+ movzx h, byte [b2 + pos + 1]
+ mov z, [t1 + x * 8]
+ xor z, [t2 + y * 8]
+ xor hash, z
+ pext x, hash, mask
+ cmp x, trigger
+ je .ret_0
+
+ rorx hash, hash, 0x3f
+ mov z, [t1 + a * 8]
+ xor z, [t2 + h * 8]
+ xor hash, z
+ pext y, hash, mask
+ cmp y, trigger
+ je .ret_1
+
+ add pos, 2
+ cmp pos, max
+ jle .loop2
+
+.less_than_2:
+ add max, 1
+ cmp pos, max
+ jg .ret_0
+ rorx hash, hash, 0x3f
+ movzx x, byte [b1 + pos]
+ movzx y, byte [b2 + pos]
+ mov z, [t1 + x * 8]
+ xor z, [t2 + y * 8]
+ xor hash, z
+.ret_1: add pos, 1
+.ret_0: mov dword [idx], pos.w
+ mov rax, hash
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data
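rolling_hash2_until_04.asm is the same loop specialized for BMI2: rorx rotates without touching flags, and the and/cmp pair of the _00 version collapses into a single pext against a trigger that was itself pext-ed once before the loop. The two boundary tests are equivalent, as this small illustration shows (requires a BMI2-capable target, e.g. -mbmi2; not code from the file):

#include <stdint.h>
#include <immintrin.h>

/* _00 style: mask, then compare. */
static inline int hit_and_cmp(uint64_t hash, uint64_t mask, uint64_t trigger)
{
	return (hash & mask) == (trigger & mask);
}

/* _04 style: pext compacts the masked bits of both operands, so the
 * comparison runs on already-aligned bit strings.  trigger_pext is
 * _pext_u64(trigger, mask), computed once outside the loop. */
static inline int hit_pext(uint64_t hash, uint64_t mask, uint64_t trigger_pext)
{
	return _pext_u64(hash, mask) == trigger_pext;
}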
diff --git a/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hashx_base.c b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hashx_base.c
new file mode 100644
index 000000000..4197def0e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/rolling_hash/rolling_hashx_base.c
@@ -0,0 +1,65 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+inline int floor_pow2(uint32_t in)
+{
+ uint32_t x = in;
+
+ while (in) {
+ x = in;
+ in &= (in - 1);
+ }
+ return x;
+}
+
+inline uint32_t rol(uint32_t x, int i)
+{
+ return x << i | x >> (8 * sizeof(x) - i);
+}
+
+uint32_t rolling_hashx_mask_gen(long mean, int shift)
+{
+ if (mean <= 2)
+ mean = 2;
+
+ return rol(floor_pow2(mean) - 1, shift);
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver rolling_hashx_mask_gen_slver_00000260;
+struct slver rolling_hashx_mask_gen_slver = { 0x0260, 0x00, 0x00 };
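rolling_hashx_mask_gen() turns a requested mean chunk size into the mask the run functions test against: floor_pow2() keeps only the highest set bit, subtracting one yields a block of contiguous low bits, and rol() lets the caller relocate those bits. A worked example (arithmetic done by hand; assumes the function is declared in rolling_hashx.h, as the test file's include suggests):

#include <stdint.h>
#include <stdio.h>
#include "rolling_hashx.h"

int main(void)
{
	/* floor_pow2(8192) == 0x2000; 0x2000 - 1 == 0x1fff (13 bits);
	 * rol(0x1fff, 0) == 0x1fff.  A random hash matches a 13-bit mask with
	 * probability 1/8192, so boundaries land ~8 KiB apart on average. */
	uint32_t mask = rolling_hashx_mask_gen(8192, 0);

	printf("mask = 0x%x\n", mask);
	return 0;
}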
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am
new file mode 100644
index 000000000..3f3c589ad
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am
@@ -0,0 +1,130 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += sha1_mb/sha1_ctx_sse.c \
+ sha1_mb/sha1_ctx_avx.c \
+ sha1_mb/sha1_ctx_avx2.c \
+ sha1_mb/sha1_ctx_base.c
+
+lsrc_x86_64 += sha1_mb/sha1_mb_mgr_init_sse.c \
+ sha1_mb/sha1_mb_mgr_init_avx2.c
+
+lsrc_x86_64 += sha1_mb/sha1_mb_mgr_submit_sse.asm \
+ sha1_mb/sha1_mb_mgr_submit_avx.asm \
+ sha1_mb/sha1_mb_mgr_submit_avx2.asm \
+ sha1_mb/sha1_mb_mgr_flush_sse.asm \
+ sha1_mb/sha1_mb_mgr_flush_avx.asm \
+ sha1_mb/sha1_mb_mgr_flush_avx2.asm \
+ sha1_mb/sha1_mb_x4_sse.asm \
+ sha1_mb/sha1_mb_x4_avx.asm \
+ sha1_mb/sha1_mb_x8_avx2.asm \
+ sha1_mb/sha1_multibinary.asm
+
+lsrc_x86_64 += sha1_mb/sha1_ctx_avx512.c \
+ sha1_mb/sha1_mb_mgr_init_avx512.c \
+ sha1_mb/sha1_mb_mgr_submit_avx512.asm \
+ sha1_mb/sha1_mb_mgr_flush_avx512.asm \
+ sha1_mb/sha1_mb_x16_avx512.asm
+
+lsrc_x86_64 += sha1_mb/sha1_opt_x1.asm
+
+lsrc_x86_64 += sha1_mb/sha1_ni_x1.asm \
+ sha1_mb/sha1_ni_x2.asm \
+ sha1_mb/sha1_ctx_sse_ni.c \
+ sha1_mb/sha1_ctx_avx512_ni.c \
+ sha1_mb/sha1_mb_mgr_submit_sse_ni.asm \
+ sha1_mb/sha1_mb_mgr_flush_sse_ni.asm \
+ sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += sha1_mb/sha1_ctx_base.c \
+ sha1_mb/sha1_ref.c \
+ sha1_mb/aarch64/sha1_mb_multibinary.S \
+ sha1_mb/aarch64/sha1_ctx_ce.c \
+ sha1_mb/aarch64/sha1_mb_x1_ce.S \
+ sha1_mb/aarch64/sha1_mb_x2_ce.S \
+ sha1_mb/aarch64/sha1_mb_mgr_ce.c \
+ sha1_mb/aarch64/sha1_ctx_asimd.c \
+ sha1_mb/aarch64/sha1_aarch64_x1.S \
+ sha1_mb/aarch64/sha1_mb_asimd_x4.S \
+ sha1_mb/aarch64/sha1_mb_mgr_asimd.c \
+ sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c
+
+
+
+lsrc_base_aliases += sha1_mb/sha1_ctx_base_aliases.c \
+ sha1_mb/sha1_ctx_base.c \
+ sha1_mb/sha1_ref.c
+
+src_include += -I $(srcdir)/sha1_mb
+
+extern_hdrs += include/sha1_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ include/multibinary.asm \
+ sha1_mb/sha1_job.asm \
+ sha1_mb/sha1_mb_mgr_datastruct.asm \
+ include/reg_sizes.asm \
+ sha1_mb/sha1_ref.c \
+ include/memcpy_inline.h \
+ include/memcpy.asm \
+ include/intrinreg.h
+
+check_tests += sha1_mb/sha1_mb_test \
+ sha1_mb/sha1_mb_rand_test \
+ sha1_mb/sha1_mb_rand_update_test \
+ sha1_mb/sha1_mb_flush_test
+
+unit_tests += sha1_mb/sha1_mb_rand_ssl_test
+
+perf_tests += sha1_mb/sha1_mb_vs_ossl_perf \
+ sha1_mb/sha1_mb_vs_ossl_shortage_perf
+
+examples += sha1_mb/sha1_multi_buffer_example
+
+
+sha1_mb_rand_test: sha1_ref.o
+sha1_mb_sha1_mb_rand_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la
+
+sha1_mb_rand_update_test: sha1_ref.o
+sha1_mb_sha1_mb_rand_update_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la
+
+sha1_mb_flush_test: sha1_ref.o
+sha1_mb_sha1_mb_flush_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la
+
+sha1_mb_rand_ssl_test: LDLIBS += -lcrypto
+sha1_mb_sha1_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sha1_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sha1_mb_sha1_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
+sha1_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto
+sha1_mb_sha1_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S
new file mode 100644
index 000000000..55d6f932f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_aarch64_x1.S
@@ -0,0 +1,294 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+ input_data .req x0
+ num_blocks .req w1
+ digest .req x2
+
+ // x2 is reused intentionally between digest/tmp
+ // due to running out of registers
+ TMP .req x2
+ TMPW .req w2
+ sha1key_adr .req x3
+ WK .req w3
+ WF .req w4
+ WA .req w5
+ WB .req w6
+ WC .req w7
+ WD .req w8
+ WE .req w9
+ WORD0 .req w10
+ WORD1 .req w11
+ WORD2 .req w12
+ WORD3 .req w13
+ WORD4 .req w14
+ WORD5 .req w15
+ WORD6 .req w16
+ WORD7 .req w17
+ WORD8 .req w18
+ WORD9 .req w19
+ WORD10 .req w20
+ WORD11 .req w21
+ WORD12 .req w22
+ WORD13 .req w23
+ WORD14 .req w24
+ WORD15 .req w25
+ AA .req w26
+ BB .req w27
+ CC .req w28
+ DD .req w29
+ EE .req w30
+
+ TT .req w0
+
+.macro save_stack
+ stp x16,x17,[sp, -128]!
+ stp x18,x19,[sp, 16]
+ stp x20,x21,[sp, 32]
+ stp x22,x23,[sp, 48]
+ stp x24,x25,[sp, 64]
+ stp x26,x27,[sp, 80]
+ stp x28,x29,[sp, 96]
+ str x30,[sp, 112]
+ // have to reuse x2, which is digest address
+ str x2,[sp, 120]
+.endm
+
+.macro restore_stack
+ ldp x18,x19,[sp, 16]
+ ldp x20,x21,[sp, 32]
+ ldp x22,x23,[sp, 48]
+ ldp x24,x25,[sp, 64]
+ ldp x26,x27,[sp, 80]
+ ldp x28,x29,[sp, 96]
+ ldr x30,[sp, 112]
+ ldr x2,[sp, 120]
+ ldp x16,x17,[sp],128
+.endm
+// macro F = (D ^ (B & (C ^ D)))
+.macro FUNC_F0
+ eor WF, WC, WD
+ and WF, WB, WF
+ eor WF, WD, WF
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F1
+ eor WF, WB, WC
+ eor WF, WF, WD
+.endm
+
+// F = ((B & C) | (B & D) | (C & D))
+.macro FUNC_F2
+ and TMPW, WB, WC
+ and WF, WB, WD
+ orr WF, WF, TMPW
+ and TMPW, WC, WD
+ orr WF, WF, TMPW
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F3
+ FUNC_F1
+.endm
+
+.altmacro
+.macro load_next_word windex
+ .if \windex < 16
+ load_word_at \windex
+ .endif
+.endm
+
+.macro SHA1_STEP_00_15 windex:req
+ rev WORD\windex\(),WORD\windex\()
+ next_word=\windex+1
+ load_next_word %next_word
+
+ ror TMPW,WA,#32-5
+ add WE,WE,TMPW
+ add WE,WE,WK
+ FUNC_F0
+ ror WB,WB,#32-30
+ add WE,WE,WORD\windex\()
+ add WE,WE,WF
+.endm
+
+.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req
+ eor TMPW,\reg_14,\reg_8
+ eor \reg_16,\reg_16,\reg_3
+ eor \reg_16,\reg_16,TMPW
+
+ ror TMPW,WA,#32-5
+ ror \reg_16,\reg_16, #32 - 1
+
+ add WE,WE,TMPW
+ add WE,WE,WK
+ \func_f
+ ror WB,WB,#32-30
+ add WE,WE,\reg_16
+ add WE,WE,WF
+.endm
+
+.macro SWAP_STATES
+ .unreq TT
+ TT .req WE
+ .unreq WE
+ WE .req WD
+ .unreq WD
+ WD .req WC
+ .unreq WC
+ WC .req WB
+ .unreq WB
+ WB .req WA
+ .unreq WA
+ WA .req TT
+.endm
+
+.altmacro
+.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req
+ SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\()
+.endm
+
+.macro exec_step windex:req
+ .if \windex <= 15
+ SHA1_STEP_00_15 windex
+ .else
+ idx14=((\windex - 14) & 15)
+ idx8=((\windex - 8) & 15)
+ idx3=((\windex - 3) & 15)
+ idx16=(\windex & 15)
+ .if \windex <= 19
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 20 && \windex <= 39
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 40 && \windex <= 59
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 60 && \windex <= 79
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .endif
+
+ SWAP_STATES
+.endm
+
+.macro exec_steps idx:req,more:vararg
+ exec_step \idx
+ .ifnb \more
+ exec_steps \more
+ .endif
+.endm
+
+.altmacro
+
+.macro load_two_words_at idx0:req,idx1:req
+ ldp WORD\idx0\(),WORD\idx1\(),[input_data],8
+.endm
+
+.macro load_word_at idx:req
+ .if \idx % 2 == 0
+ idx1=\idx+1
+ load_two_words_at \idx,%idx1
+ .endif
+.endm
+
+/*
+ * void sha1_aarch64_x1(uint32_t *input_data, int num_blocks, uint32_t digest[5])
+ */
+ .global sha1_aarch64_x1
+ .type sha1_aarch64_x1, %function
+sha1_aarch64_x1:
+ cmp num_blocks, #0
+ beq .return
+
+ ldp WA,WB,[digest]
+ ldp WC,WD,[digest,8]
+ ldr WE,[digest,16]
+ save_stack
+
+.block_loop:
+ mov AA, WA
+ mov BB, WB
+ mov CC, WC
+ mov DD, WD
+ mov EE, WE
+
+ load_word_at 0
+
+ adr sha1key_adr, KEY_0
+ ldr WK, [sha1key_adr]
+ exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
+
+ // 20 ~ 39
+ adr sha1key_adr, KEY_1
+ ldr WK, [sha1key_adr]
+ exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
+
+ // 40 ~ 59
+ adr sha1key_adr, KEY_2
+ ldr WK, [sha1key_adr]
+ exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
+
+ // 60 ~ 79
+ adr sha1key_adr, KEY_3
+ ldr WK, [sha1key_adr]
+ exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79
+
+ add WA, AA, WA
+ add WB, BB, WB
+ add WC, CC, WC
+ add WD, DD, WD
+ add WE, EE, WE
+
+ subs num_blocks, num_blocks, 1
+ bne .block_loop
+
+ restore_stack
+ stp WA,WB,[digest]
+ stp WC,WD,[digest,8]
+ str WE,[digest,16]
+
+.return:
+ ret
+
+ .size sha1_aarch64_x1, .-sha1_aarch64_x1
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+KEY_0:
+ .word 0x5a827999
+KEY_1:
+ .word 0x6ed9eba1
+KEY_2:
+ .word 0x8f1bbcdc
+KEY_3:
+ .word 0xca62c1d6
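Each exec_step above is one textbook SHA-1 round; the .unreq/.req renaming in SWAP_STATES stands in for the usual rotation of the five working variables, and KEY_0..KEY_3 hold the four round constants. For reference, the same round written in C (rounds 0-19 shown with F0 and KEY_0; later rounds substitute F1/F2/F3 and the other constants; an illustration, not code from the file):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int r)
{
	return (x << r) | (x >> (32 - r));
}

/* One SHA-1 round for steps 0-19. */
static void sha1_round_f0(uint32_t s[5], uint32_t w_i)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4];
	uint32_t f = d ^ (b & (c ^ d));				/* FUNC_F0 */
	uint32_t t = rotl32(a, 5) + f + e + 0x5a827999u + w_i;

	s[4] = d;			/* e <- d        */
	s[3] = c;			/* d <- c        */
	s[2] = rotl32(b, 30);		/* c <- b <<< 30 */
	s[1] = a;			/* b <- a        */
	s[0] = t;			/* a <- t        */
}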
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S
new file mode 100644
index 000000000..c8b8dd982
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_asimd_common.S
@@ -0,0 +1,269 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+// macro F = (D ^ (B & (C ^ D)))
+.macro FUNC_F0
+ eor VF.16b, VC.16b, VD.16b
+ and VF.16b, VB.16b, VF.16b
+ eor VF.16b, VD.16b, VF.16b
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F1
+ eor VF.16b, VB.16b, VC.16b
+ eor VF.16b, VF.16b, VD.16b
+.endm
+
+// F = ((B & C) | (B & D) | (C & D))
+.macro FUNC_F2
+ and vT0.16b, VB.16b, VC.16b
+ and vT1.16b, VB.16b, VD.16b
+ and vT2.16b, VC.16b, VD.16b
+ orr VF.16b, vT0.16b, vT1.16b
+ orr VF.16b, VF.16b, vT2.16b
+.endm
+
+// F = (B ^ C ^ D)
+.macro FUNC_F3
+ FUNC_F1
+.endm
+
+.altmacro
+.macro load_next_word windex
+ .if \windex < 16
+ load_x4_word \windex
+ .endif
+.endm
+
+// FUNC_F0 is merged into STEP_00_15 for efficiency
+.macro SHA1_STEP_00_15_F0 windex:req
+ rev32 WORD\windex\().16b,WORD\windex\().16b
+ next_word=\windex+1
+ load_next_word %next_word
+ // e = (a leftrotate 5) + f + e + k + w[i]
+ ushr VT.4s, VA.4s, 32 - 5
+ add VE.4s, VE.4s, VK.4s
+ sli VT.4s, VA.4s, 5
+ eor VF.16b, VC.16b, VD.16b
+ add VE.4s, VE.4s, WORD\windex\().4s
+ and VF.16b, VB.16b, VF.16b
+ add VE.4s, VE.4s, VT.4s
+ eor VF.16b, VD.16b, VF.16b
+ ushr VT.4s, VB.4s, 32 - 30
+ add VE.4s, VE.4s, VF.4s
+ sli VT.4s, VB.4s, 30
+.endm
+
+.macro SHA1_STEP_16_79 windex:req,func_f:req,reg_3:req,reg_8:req,reg_14:req,reg_16:req
+ eor vT0.16b,\reg_3\().16b,\reg_8\().16b
+ eor VT.16b,\reg_14\().16b,\reg_16\().16b
+ eor vT0.16b,vT0.16b,VT.16b
+ // e = (a leftrotate 5) + f + e + k + w[i]
+ ushr VT.4s, vT0.4s, 32 - 1
+ add VE.4s, VE.4s, VK.4s
+ ushr vT1.4s, VA.4s, 32 - 5
+ sli VT.4s, vT0.4s, 1
+ add VE.4s, VE.4s, VT.4s
+ sli vT1.4s, VA.4s, 5
+ mov \reg_16\().16b,VT.16b
+ add VE.4s, VE.4s, vT1.4s
+ ushr VT.4s, VB.4s, 32 - 30
+ \func_f
+ add VE.4s, VE.4s, VF.4s
+ sli VT.4s, VB.4s, 30
+.endm
+
+ VA .req v0
+ VB .req v1
+ VC .req v2
+ VD .req v3
+ VE .req v4
+ VT .req v5
+ VF .req v6
+ VK .req v7
+ WORD0 .req v8
+ WORD1 .req v9
+ WORD2 .req v10
+ WORD3 .req v11
+ WORD4 .req v12
+ WORD5 .req v13
+ WORD6 .req v14
+ WORD7 .req v15
+ WORD8 .req v16
+ WORD9 .req v17
+ WORD10 .req v18
+ WORD11 .req v19
+ WORD12 .req v20
+ WORD13 .req v21
+ WORD14 .req v22
+ WORD15 .req v23
+ vT0 .req v24
+ vT1 .req v25
+ vT2 .req v26
+ vAA .req v27
+ vBB .req v28
+ vCC .req v29
+ vDD .req v30
+ vEE .req v31
+ TT .req v0
+ sha1key_adr .req x15
+
+.macro SWAP_STATES
+ // shifted VB is held in VT after each step
+ .unreq TT
+ TT .req VE
+ .unreq VE
+ VE .req VD
+ .unreq VD
+ VD .req VC
+ .unreq VC
+ VC .req VT
+ .unreq VT
+ VT .req VB
+ .unreq VB
+ VB .req VA
+ .unreq VA
+ VA .req TT
+.endm
+
+.altmacro
+.macro SHA1_STEP_16_79_WRAPPER windex:req,func_f:req,idx3:req,idx8:req,idx14:req,idx16:req
+ SHA1_STEP_16_79 \windex,\func_f,WORD\idx3\(),WORD\idx8\(),WORD\idx14\(),WORD\idx16\()
+.endm
+
+.macro exec_step windex:req
+ .if \windex <= 15
+ SHA1_STEP_00_15_F0 windex
+ .else
+ idx14=((\windex - 14) & 15)
+ idx8=((\windex - 8) & 15)
+ idx3=((\windex - 3) & 15)
+ idx16=(\windex & 15)
+ .if \windex <= 19
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F0,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 20 && \windex <= 39
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F1,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 40 && \windex <= 59
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F2,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .if \windex >= 60 && \windex <= 79
+ SHA1_STEP_16_79_WRAPPER \windex,FUNC_F3,%idx3,%idx8,%idx14,%idx16
+ .endif
+ .endif
+
+ SWAP_STATES
+
+ .if \windex == 79
+	// after 80 steps, the registers ABCDET have shifted from
+	// their original order of 012345 to 341520;
+	// we have to swap back for both compile- and run-time correctness
+ mov v0.16b,v3.16b
+ .unreq VA
+ VA .req v0
+
+ mov vT0.16b,v2.16b
+ mov v2.16b,v1.16b
+ mov v1.16b,v4.16b
+ .unreq VB
+ VB .req v1
+ .unreq VC
+ VC .req v2
+
+ mov v3.16b,v5.16b
+ .unreq VD
+ VD .req v3
+
+ mov v4.16b,vT0.16b
+ .unreq VE
+ VE .req v4
+
+ .unreq VT
+ VT .req v5
+ .endif
+.endm
+
+.macro exec_steps idx:req,more:vararg
+ exec_step \idx
+ .ifnb \more
+ exec_steps \more
+ .endif
+.endm
+
+.macro sha1_single
+ load_x4_word 0
+
+ mov vAA.16B, VA.16B
+ mov vBB.16B, VB.16B
+ mov vCC.16B, VC.16B
+ mov vDD.16B, VD.16B
+ mov vEE.16B, VE.16B
+
+ adr sha1key_adr, KEY_0
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
+
+ // 20 ~ 39
+ adr sha1key_adr, KEY_1
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
+
+ // 40 ~ 59
+ adr sha1key_adr, KEY_2
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
+
+ // 60 ~ 79
+ adr sha1key_adr, KEY_3
+ ld1 {VK.4s}, [sha1key_adr]
+ exec_steps 60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79
+
+ add VA.4s, vAA.4s, VA.4s
+ add VB.4s, vBB.4s, VB.4s
+ add VC.4s, vCC.4s, VC.4s
+ add VD.4s, vDD.4s, VD.4s
+ add VE.4s, vEE.4s, VE.4s
+.endm
+
+.macro sha1_asimd_save_stack
+ stp d8,d9,[sp, -64]!
+ stp d10,d11,[sp, 16]
+ stp d12,d13,[sp, 32]
+ stp d14,d15,[sp, 48]
+.endm
+
+.macro sha1_asimd_restore_stack
+ ldp d10,d11,[sp, 16]
+ ldp d12,d13,[sp, 32]
+ ldp d14,d15,[sp, 48]
+ ldp d8,d9,[sp],64
+.endm
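sha1_asimd_common.S carries the same round structure into NEON registers: every operand is a .4s vector, so one step advances four 32-bit lanes with identical arithmetic. Conceptually that is four independent SHA-1 states updated in lockstep, one message per lane (lane count inferred from the .4s arrangement; a rough model, not the file's code):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int r)
{
	return (x << r) | (x >> (32 - r));
}

/* Rough model of SHA1_STEP_00_15_F0: the scalar F0 round applied to four
 * lanes at once.  The register renaming the assembly performs with
 * SWAP_STATES is not modeled here. */
static void step_f0_x4(const uint32_t a[4], uint32_t b[4], const uint32_t c[4],
                       const uint32_t d[4], uint32_t e[4],
                       const uint32_t w[4], uint32_t k)
{
	for (int lane = 0; lane < 4; lane++) {
		uint32_t f = d[lane] ^ (b[lane] & (c[lane] ^ d[lane]));
		e[lane] += rotl32(a[lane], 5) + f + k + w[lane];
		b[lane] = rotl32(b[lane], 30);
	}
}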
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c
new file mode 100644
index 000000000..9a9952ff6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_asimd.c
@@ -0,0 +1,250 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+void sha1_mb_mgr_init_asimd(SHA1_MB_JOB_MGR * state);
+SHA1_JOB *sha1_mb_mgr_submit_asimd(SHA1_MB_JOB_MGR * state, SHA1_JOB * job);
+SHA1_JOB *sha1_mb_mgr_flush_asimd(SHA1_MB_JOB_MGR * state);
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_asimd(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_asimd(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_asimd(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_asimd(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_asimd(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_asimd(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_asimd_slver_02020142;
+struct slver sha1_ctx_mgr_init_asimd_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_submit_asimd_slver_02020143;
+struct slver sha1_ctx_mgr_submit_asimd_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_flush_asimd_slver_02020144;
+struct slver sha1_ctx_mgr_flush_asimd_slver = { 0x0144, 0x02, 0x02 };
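The submit/flush pair above follows the usual ISA-L multi-buffer pattern: submit work on as many contexts as desired, then flush until NULL to drain whatever is still in flight. A hedged usage sketch for the single-buffer case (hash_ctx_init() and any alignment requirements on SHA1_HASH_CTX_MGR are assumed from the multi_buffer headers and may differ; the digest is read from ctx->job.result_digest, as the code above does):

#include <stdint.h>
#include "sha1_mb.h"

/* Sketch only: hash one whole buffer with the ASIMD context manager.
 * hash_ctx_init() is assumed to exist in multi_buffer.h; error and status
 * checking (ctx->error, ctx->status) is omitted for brevity. */
static void sha1_one_buffer_asimd(const void *buf, uint32_t len,
                                  uint32_t digest[SHA1_DIGEST_NWORDS])
{
	SHA1_HASH_CTX_MGR mgr;
	SHA1_HASH_CTX ctx;
	SHA1_HASH_CTX *done;
	int i;

	sha1_ctx_mgr_init_asimd(&mgr);
	hash_ctx_init(&ctx);	/* assumed initializer from multi_buffer.h */

	done = sha1_ctx_mgr_submit_asimd(&mgr, &ctx, buf, len, HASH_ENTIRE);
	while (done == NULL)
		done = sha1_ctx_mgr_flush_asimd(&mgr);

	for (i = 0; i < SHA1_DIGEST_NWORDS; i++)
		digest[i] = done->job.result_digest[i];
}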
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c
new file mode 100644
index 000000000..e40a344ff
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_ctx_ce.c
@@ -0,0 +1,250 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+void sha1_mb_mgr_init_ce(SHA1_MB_JOB_MGR * state);
+SHA1_JOB *sha1_mb_mgr_submit_ce(SHA1_MB_JOB_MGR * state, SHA1_JOB * job);
+SHA1_JOB *sha1_mb_mgr_flush_ce(SHA1_MB_JOB_MGR * state);
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_ce(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_ce(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_ce(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_ce(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_ce(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_ce(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
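+	// e.g. assuming the usual 8-byte length field: when (total_len % 64) <= 55
+	// the 0x80 byte and the length fit in the first extra block (i becomes 64,
+	// one block to hash); otherwise i becomes 128 and two extra blocks are hashed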
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_ce_slver_02020142;
+struct slver sha1_ctx_mgr_init_ce_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_submit_ce_slver_02020143;
+struct slver sha1_ctx_mgr_submit_ce_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_flush_ce_slver_02020144;
+struct slver sha1_ctx_mgr_flush_ce_slver = { 0x0144, 0x02, 0x02 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c
new file mode 100644
index 000000000..0942c1a95
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_aarch64_dispatcher.c
@@ -0,0 +1,93 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
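+/*
+ * Dispatch policy: prefer the SHA-1 Crypto Extension implementation when
+ * HWCAP_SHA1 is present, fall back to the multi-lane ASIMD implementation on
+ * cores known to benefit from it, and otherwise use the generic base
+ * implementation.
+ */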
+DEFINE_INTERFACE_DISPATCHER(sha1_ctx_mgr_submit)
+{
+
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(sha1_ctx_mgr_submit_ce);
+
+ if (auxval & HWCAP_ASIMD) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A57): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A72): // fall through
+ return PROVIDER_INFO(sha1_ctx_mgr_submit_asimd);
+ default:
+ break;
+ }
+ }
+
+ return PROVIDER_BASIC(sha1_ctx_mgr_submit);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha1_ctx_mgr_init)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(sha1_ctx_mgr_init_ce);
+
+ if (auxval & HWCAP_ASIMD) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A57): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A72): // fall through
+ return PROVIDER_INFO(sha1_ctx_mgr_init_asimd);
+ default:
+ break;
+ }
+ }
+
+ return PROVIDER_BASIC(sha1_ctx_mgr_init);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha1_ctx_mgr_flush)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA1)
+ return PROVIDER_INFO(sha1_ctx_mgr_flush_ce);
+
+ if (auxval & HWCAP_ASIMD) {
+ switch (get_micro_arch_id()) {
+ case MICRO_ARCH_ID(ARM, NEOVERSE_N1): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A57): // fall through
+ case MICRO_ARCH_ID(ARM, CORTEX_A72): // fall through
+ return PROVIDER_INFO(sha1_ctx_mgr_flush_asimd);
+ default:
+ break;
+ }
+ }
+
+ return PROVIDER_BASIC(sha1_ctx_mgr_flush);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S
new file mode 100644
index 000000000..012b15c14
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_asimd_x4.S
@@ -0,0 +1,192 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8-a
+
+#include "sha1_asimd_common.S"
+
+.macro internal_load windex
+	// load 64 bytes from each lane's data pointer to make full use of the cache line
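+	// the st4 stores below transpose the input so that each WORDn vector
+	// ends up holding message word n from all four lanes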
+ .if \windex == 0
+ mov tmp,dataptr
+ ld1 {WORD0.4s},[data0],16
+ ld1 {WORD4.4s},[data0],16
+ ld1 {WORD8.4s},[data0],16
+ ld1 {WORD12.4s},[data0],16
+
+ ld1 {WORD1.4s},[data1],16
+ ld1 {WORD5.4s},[data1],16
+ ld1 {WORD9.4s},[data1],16
+ ld1 {WORD13.4s},[data1],16
+
+ ld1 {WORD2.4s},[data2],16
+ ld1 {WORD6.4s},[data2],16
+ ld1 {WORD10.4s},[data2],16
+ ld1 {WORD14.4s},[data2],16
+
+ ld1 {WORD3.4s},[data3],16
+ ld1 {WORD7.4s},[data3],16
+ ld1 {WORD11.4s},[data3],16
+ ld1 {WORD15.4s},[data3],16
+
+ st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[0],[tmp],16
+ st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[1],[tmp],16
+ st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[2],[tmp],16
+ st4 {WORD0.s,WORD1.s,WORD2.s,WORD3.s}[3],[tmp],16
+ .endif
+
+ .if \windex == 4
+ mov tmp,dataptr
+ st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[0],[tmp],16
+ st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[1],[tmp],16
+ st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[2],[tmp],16
+ st4 {WORD4.s,WORD5.s,WORD6.s,WORD7.s}[3],[tmp],16
+ .endif
+
+ .if \windex == 8
+ mov tmp,dataptr
+ st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[0],[tmp],16
+ st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[1],[tmp],16
+ st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[2],[tmp],16
+ st4 {WORD8.s,WORD9.s,WORD10.s,WORD11.s}[3],[tmp],16
+ .endif
+
+ .if \windex == 12
+ mov tmp,dataptr
+ st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[0],[tmp],16
+ st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[1],[tmp],16
+ st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[2],[tmp],16
+ st4 {WORD12.s,WORD13.s,WORD14.s,WORD15.s}[3],[tmp],16
+ .endif
+.endm
+
+.macro load_x4_word idx:req
+ internal_load \idx
+ ld1 {WORD\idx\().16b},[dataptr],16
+.endm
+
+/*
+ * void sha1_mb_asimd_x4(SHA1_JOB *j0, SHA1_JOB*j1, SHA1_JOB*j2, SHA1_JOB *j3, int blocks)
+ */
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ job3 .req x3
+ num_blocks .req w4
+ tmp .req x5
+ data0 .req x6
+ data1 .req x7
+ data2 .req x8
+ data3 .req x9
+ databuf .req x10
+ dataptr .req x11
+ savedsp .req x12
+
+ .global sha1_mb_asimd_x4
+ .type sha1_mb_asimd_x4, %function
+sha1_mb_asimd_x4:
+ cmp num_blocks, #0
+ beq .return
+ sha1_asimd_save_stack
+ mov savedsp,sp
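+	// carve out a 256-byte scratch buffer below sp, aligned down to 64 bytes,
+	// to hold the transposed message words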
+ sub databuf,sp,256
+ mov tmp,63
+ bic databuf,databuf,tmp
+ mov sp,databuf
+
+ add tmp,job0,64
+ ld4 {VA.s,VB.s,VC.s,VD.s}[0],[tmp],#16
+ ld1 {VE.s}[0],[tmp]
+ ldr data0,[job0]
+
+ add tmp,job1,64
+ ld4 {VA.s,VB.s,VC.s,VD.s}[1],[tmp],#16
+ ld1 {VE.s}[1],[tmp]
+ ldr data1,[job1]
+
+ add tmp,job2,64
+ ld4 {VA.s,VB.s,VC.s,VD.s}[2],[tmp],#16
+ ld1 {VE.s}[2],[tmp]
+ ldr data2,[job2]
+
+ add tmp,job3,64
+ ld4 {VA.s,VB.s,VC.s,VD.s}[3],[tmp],#16
+ ld1 {VE.s}[3],[tmp]
+ ldr data3,[job3]
+
+.block_loop:
+ mov dataptr,databuf
+ sha1_single
+ subs num_blocks, num_blocks, 1
+ bne .block_loop
+
+ add tmp,job0,64
+ st4 {VA.s,VB.s,VC.s,VD.s}[0],[tmp],#16
+ st1 {VE.s}[0],[tmp]
+
+ add tmp,job1,64
+ st4 {VA.s,VB.s,VC.s,VD.s}[1],[tmp],#16
+ st1 {VE.s}[1],[tmp]
+
+ add tmp,job2,64
+ st4 {VA.s,VB.s,VC.s,VD.s}[2],[tmp],#16
+ st1 {VE.s}[2],[tmp]
+
+ add tmp,job3,64
+ st4 {VA.s,VB.s,VC.s,VD.s}[3],[tmp],#16
+ st1 {VE.s}[3],[tmp]
+
+ mov sp,savedsp
+ sha1_asimd_restore_stack
+.return:
+ ret
+
+ .size sha1_mb_asimd_x4, .-sha1_mb_asimd_x4
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+KEY_0:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+KEY_1:
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+KEY_2:
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+KEY_3:
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c
new file mode 100644
index 000000000..4b34e7b53
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_asimd.c
@@ -0,0 +1,217 @@
+/**********************************************************************
+ Copyright(c) 2021 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sha1_mb.h>
+#include <assert.h>
+#include "endian_helper.h"
+
+extern void sha1_aarch64_x1(const uint8_t * data, int num_blocks, uint32_t digest[]);
+static inline void sha1_job_x1(SHA1_JOB * job, int blocks)
+{
+ sha1_aarch64_x1(job->buffer, blocks, job->result_digest);
+}
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SHA1_MB_ASIMD_MAX_LANES 4
+void sha1_mb_asimd_x4(SHA1_JOB *, SHA1_JOB *, SHA1_JOB *, SHA1_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+
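+/*
+ * lens[i] packs a lane's state as (remaining length in blocks << 4) | lane
+ * index: the macros above test the upper bits to see whether a lane still has
+ * data, and sha1_mb_mgr_do_jobs() recovers the lane index of the shortest job
+ * from the low nibble of the minimum value.
+ */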
+void sha1_mb_mgr_init_asimd(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int i;
+
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = 0; i < SHA1_MB_ASIMD_MAX_LANES; i++) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= SHA1_MB_ASIMD_MAX_LANES - 1 - i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	// lanes beyond SHA1_MB_ASIMD_MAX_LANES are marked as invalid
+ for (; i < SHA1_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sha1_mb_mgr_do_jobs(SHA1_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i, lanes, blocks;
+ int lane_idx_array[SHA1_MAX_LANES];
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+ lanes = 0, len = 0;
+ for (i = 0; i < SHA1_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ if (lanes)
+ len = min(len, state->lens[i]);
+ else
+ len = state->lens[i];
+ lane_idx_array[lanes] = i;
+ lanes++;
+ }
+ }
+
+ if (lanes == 0)
+ return -1;
+ lane_idx = len & 0xf;
+ len = len & (~0xf);
+ blocks = len >> 4;
+
+	/* With fewer than 3 active lanes, ASIMD has little advantage over the
+	 * scalar path because at least 50% of its capacity is wasted, so the
+	 * ASIMD kernel is only used when 3 or 4 lanes carry data.
+	 */
+ if (lanes == SHA1_MB_ASIMD_MAX_LANES) {
+ sha1_mb_asimd_x4(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane,
+ state->ldata[lane_idx_array[3]].job_in_lane, blocks);
+ } else if (lanes == 3) {
+		/* With 3 lanes the ASIMD kernel still processes four lanes of data
+		 * (wasting 25% of its capacity). It could be told the real lane
+		 * count to save some memory loads, but in practice passing lane 0
+		 * again as a dummy gives similar cache behaviour.
+		 */
+ SHA1_JOB dummy;
+ dummy.buffer = state->ldata[lane_idx_array[0]].job_in_lane->buffer;
+ dummy.len = state->ldata[lane_idx_array[0]].job_in_lane->len;
+ sha1_mb_asimd_x4(state->ldata[lane_idx_array[0]].job_in_lane,
+ &dummy,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane, blocks);
+ } else {
+ sha1_job_x1(state->ldata[lane_idx_array[0]].job_in_lane, blocks);
+ if (lanes >= 2) {
+ sha1_job_x1(state->ldata[lane_idx_array[1]].job_in_lane, blocks);
+ }
+ }
+
+	// only the minimum-length job can finish; advance the remaining lanes by the processed length
+ for (i = 0; i < SHA1_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+ return lane_idx;
+
+}
+
+static SHA1_JOB *sha1_mb_mgr_free_lane(SHA1_MB_JOB_MGR * state)
+{
+ int i;
+ SHA1_JOB *ret = NULL;
+
+ for (i = 0; i < SHA1_MB_ASIMD_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sha1_mb_mgr_insert_job(SHA1_MB_JOB_MGR * state, SHA1_JOB * job)
+{
+ int lane_idx;
+ // add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+	// no free lane here would be a fatal error
+ assert(lane_idx < SHA1_MB_ASIMD_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SHA1_JOB *sha1_mb_mgr_submit_asimd(SHA1_MB_JOB_MGR * state, SHA1_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SHA1_JOB *ret;
+
+ // add job into lanes
+ sha1_mb_mgr_insert_job(state, job);
+
+ ret = sha1_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	// submit waits until every lane has data before starting processing
+ if (state->num_lanes_inuse < SHA1_MB_ASIMD_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sha1_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sha1_mb_mgr_do_jobs(state);
+#endif
+
+ // ~ i = lane_idx;
+ ret = sha1_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SHA1_JOB *sha1_mb_mgr_flush_asimd(SHA1_MB_JOB_MGR * state)
+{
+ SHA1_JOB *ret;
+ ret = sha1_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sha1_mb_mgr_do_jobs(state);
+ return sha1_mb_mgr_free_lane(state);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c
new file mode 100644
index 000000000..1dfd67d0c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_mgr_ce.c
@@ -0,0 +1,208 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sha1_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SHA1_MB_CE_MAX_LANES 2
+#if SHA1_MB_CE_MAX_LANES >=2
+void sha1_mb_ce_x2(SHA1_JOB *, SHA1_JOB *, int);
+#endif
+void sha1_mb_ce_x1(SHA1_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+void sha1_mb_mgr_init_ce(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int i;
+
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = 0; i < SHA1_MB_CE_MAX_LANES; i++) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	// lanes beyond SHA1_MB_CE_MAX_LANES are marked as invalid
+ for (; i < SHA1_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sha1_mb_mgr_do_jobs(SHA1_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i, lanes;
+
+ int lane_idx_array[SHA1_MAX_LANES];
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+#if SHA1_MB_CE_MAX_LANES == 2
+ if (state->num_lanes_inuse == 2) {
+ len = min(state->lens[0], state->lens[1]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sha1_mb_ce_x2(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane, len >> 4);
+
+ } else
+#endif
+ {
+ lanes = 0, len = 0;
+ for (i = 0; i < SHA1_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ if (lanes)
+ len = min(len, state->lens[i]);
+ else
+ len = state->lens[i];
+ lane_idx_array[lanes] = i;
+ lanes++;
+ }
+ }
+ if (lanes == 0)
+ return -1;
+ lane_idx = len & 0xf;
+ len = len & (~0xf);
+
+#if SHA1_MB_CE_MAX_LANES >=2
+ if (lanes == 2) {
+ sha1_mb_ce_x2(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane, len >> 4);
+ } else
+#endif
+ {
+ sha1_mb_ce_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4);
+ }
+ }
+	// only the minimum-length job can finish; advance the remaining lanes by the processed length
+ for (i = 0; i < SHA1_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+
+ return lane_idx;
+
+}
+
+static SHA1_JOB *sha1_mb_mgr_free_lane(SHA1_MB_JOB_MGR * state)
+{
+ int i;
+ SHA1_JOB *ret = NULL;
+
+ for (i = 0; i < SHA1_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sha1_mb_mgr_insert_job(SHA1_MB_JOB_MGR * state, SHA1_JOB * job)
+{
+ int lane_idx;
+ //add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+	// no free lane here would be a fatal error
+ assert(lane_idx < SHA1_MB_CE_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SHA1_JOB *sha1_mb_mgr_submit_ce(SHA1_MB_JOB_MGR * state, SHA1_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SHA1_JOB *ret;
+
+ //add job into lanes
+ sha1_mb_mgr_insert_job(state, job);
+
+ ret = sha1_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	// submit waits until every lane has data before starting processing
+ if (state->num_lanes_inuse < SHA1_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sha1_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sha1_mb_mgr_do_jobs(state);
+#endif
+
+ //~ i = lane_idx;
+ ret = sha1_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SHA1_JOB *sha1_mb_mgr_flush_ce(SHA1_MB_JOB_MGR * state)
+{
+ SHA1_JOB *ret;
+ ret = sha1_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sha1_mb_mgr_do_jobs(state);
+ return sha1_mb_mgr_free_lane(state);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S
new file mode 100644
index 000000000..bb1929d76
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_multibinary.S
@@ -0,0 +1,36 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface sha1_ctx_mgr_submit
+mbin_interface sha1_ctx_mgr_init
+mbin_interface sha1_ctx_mgr_flush
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S
new file mode 100644
index 000000000..22f736793
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x1_ce.S
@@ -0,0 +1,194 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+
+/**
+Macro covering rounds 4-67: each invocation performs four SHA-1 rounds with the
+given instruction (sha1c/sha1p/sha1m) and advances the message schedule.
+*/
+.macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req,k:req
+ sha1h \e0\()_s, \abcd\()_s
+ \inst \abcd\()_q,\e1\()_s,\tmp1\()_v.4s
+ add \tmp1\()_v.4s,\msg3\()_v.4s,\k\()_v.4s
+ sha1su1 \msg0\()_v.4s,\msg3\()_v.4s
+ sha1su0 \msg1\()_v.4s,\msg2\()_v.4s,\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key_0,28
+ declare_var_vector_reg key_1,29
+ declare_var_vector_reg key_2,30
+ declare_var_vector_reg key_3,31
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg abcd,0
+ declare_var_vector_reg e0,1
+ declare_var_vector_reg e1,2
+ declare_var_vector_reg abcd_saved,3
+ declare_var_vector_reg e0_saved,4
+/*
+Message variables
+*/
+ declare_var_vector_reg msg_0,16
+ declare_var_vector_reg msg_1,17
+ declare_var_vector_reg msg_2,18
+ declare_var_vector_reg msg_3,19
+/*
+Temporary variables
+*/
+ declare_var_vector_reg tmp_0,5
+ declare_var_vector_reg tmp_1,6
+
+/*
+ void sha1_mb_ce_x1(SHA1_JOB * job, int len);
+*/
+/*
+Argument list
+*/
+ job .req x0
+ len .req w1
+ data .req x2
+ tmp .req x3
+ .global sha1_mb_ce_x1
+ .type sha1_mb_ce_x1, %function
+sha1_mb_ce_x1:
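+	// the job's data pointer is loaded from offset 0; the working digest is
+	// kept as A..D at offset 64 and E at offset 80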
+ ldr data, [job]
+ ldr abcd_q, [job, 64]
+ ldr e0_s, [job, 80]
+ adr tmp, KEY
+ ld1 {key_0_v.4s-key_3_v.4s},[tmp]
+
+start_loop:
+
+ //load msgs
+ ld1 {msg_0_v.4s-msg_3_v.4s},[data]
+
+ //adjust loop parameter
+ add data,data,64
+ sub len, len, #1
+ cmp len, 0
+ //backup digest
+ mov abcd_saved_v.16b,abcd_v.16b
+ mov e0_saved_v.16b,e0_v.16b
+
+ rev32 msg_0_v.16b,msg_0_v.16b
+ rev32 msg_1_v.16b,msg_1_v.16b
+ add tmp_0_v.4s,msg_0_v.4s,key_0_v.4s
+ rev32 msg_2_v.16b,msg_2_v.16b
+ add tmp_1_v.4s,msg_1_v.4s,key_0_v.4s
+ rev32 msg_3_v.16b,msg_3_v.16b
+
+ /* rounds 0-3 */
+ sha1h e1_s,abcd_s
+ sha1c abcd_q,e0_s,tmp_0_v.4s
+ add tmp_0_v.4s,msg_2_v.4s,key_0_v.4s
+ sha1su0 msg_0_v.4s,msg_1_v.4s,msg_2_v.4s
+
+ sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_0 /* rounds 4-7 */
+ sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_0
+ sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 12-15 */
+ sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_1
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 20-23 */
+ sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_1
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_2 /* rounds 36-39 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_2
+ sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_3 /* rounds 52-55 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_3
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_3
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_3
+
+ /* rounds 68-71 */
+ sha1h e0_s,abcd_s
+ sha1p abcd_q,e1_s,tmp_1_v.4s
+ add tmp_1_v.4s,msg_3_v.4s,key_3_v.4s
+ sha1su1 msg_0_v.4s,msg_3_v.4s
+
+ /* rounds 72-75 */
+ sha1h e1_s,abcd_s
+ sha1p abcd_q,e0_s,tmp_0_v.4s
+
+ /* rounds 76-79 */
+ sha1h e0_s,abcd_s
+ sha1p abcd_q,e1_s,tmp_1_v.4s
+
+
+
+ add abcd_v.4s,abcd_v.4s,abcd_saved_v.4s
+ add e0_v.2s,e0_v.2s,e0_saved_v.2s
+
+
+ bgt start_loop
+ str abcd_q, [job, 64]
+ str e0_s, [job, 80]
+
+ ret
+
+ .size sha1_mb_ce_x1, .-sha1_mb_ce_x1
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S
new file mode 100644
index 000000000..93f653ad2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/aarch64/sha1_mb_x2_ce.S
@@ -0,0 +1,253 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+
+/**
+Macro covering rounds 4-67 for both lanes: each invocation performs four SHA-1
+rounds with the given instruction on the l0_ and l1_ register sets in parallel.
+*/
+.macro sha1_4_rounds inst:req,msg0:req,msg1:req,msg2:req,msg3:req,abcd:req,e0:req,tmp0:req,e1:req,tmp1:req,k:req
+ sha1h l0_\e0\()_s, l0_\abcd\()_s
+ sha1h l1_\e0\()_s, l1_\abcd\()_s
+
+ \inst l0_\abcd\()_q,l0_\e1\()_s,l0_\tmp1\()_v.4s
+ \inst l1_\abcd\()_q,l1_\e1\()_s,l1_\tmp1\()_v.4s
+
+ add l0_\tmp1\()_v.4s,l0_\msg3\()_v.4s,\k\()_v.4s
+ add l1_\tmp1\()_v.4s,l1_\msg3\()_v.4s,\k\()_v.4s
+
+ sha1su1 l0_\msg0\()_v.4s,l0_\msg3\()_v.4s
+ sha1su1 l1_\msg0\()_v.4s,l1_\msg3\()_v.4s
+
+ sha1su0 l0_\msg1\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+ sha1su0 l1_\msg1\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key_0,28
+ declare_var_vector_reg key_1,29
+ declare_var_vector_reg key_2,30
+ declare_var_vector_reg key_3,31
+
+
+/*
+lane variables
+*/
+ declare_var_vector_reg l0_abcd,0
+ declare_var_vector_reg l0_e0,1
+ declare_var_vector_reg l0_e1,2
+ declare_var_vector_reg l0_abcd_saved,3
+ declare_var_vector_reg l0_e0_saved,4
+ declare_var_vector_reg l0_tmp_0,5
+ declare_var_vector_reg l0_tmp_1,6
+ declare_var_vector_reg l0_msg_0,16
+ declare_var_vector_reg l0_msg_1,17
+ declare_var_vector_reg l0_msg_2,18
+ declare_var_vector_reg l0_msg_3,19
+
+ declare_var_vector_reg l1_abcd,7
+ declare_var_vector_reg l1_e0,8
+ declare_var_vector_reg l1_e1,9
+ declare_var_vector_reg l1_abcd_saved,24
+ declare_var_vector_reg l1_e0_saved,25
+ declare_var_vector_reg l1_tmp_0,26
+ declare_var_vector_reg l1_tmp_1,27
+ declare_var_vector_reg l1_msg_0,20
+ declare_var_vector_reg l1_msg_1,21
+ declare_var_vector_reg l1_msg_2,22
+ declare_var_vector_reg l1_msg_3,23
+
+/*
+ void sha1_mb_ce_x2(SHA1_JOB * job_0, SHA1_JOB * job_1,int len);
+*/
+ l0_job .req x0
+ l1_job .req x1
+ len .req w2
+
+ l0_data .req x3
+ l1_data .req x4
+ tmp .req x5
+ .global sha1_mb_ce_x2
+ .type sha1_mb_ce_x2, %function
+sha1_mb_ce_x2:
+ //push d8,d9 to stack
+ stp d8, d9, [sp, -256]!
+
+ adr tmp, KEY
+ ld1 {key_0_v.4s-key_3_v.4s},[tmp]
+ ldr l0_data, [l0_job]
+ ldr l1_data, [l1_job]
+ ldr l0_abcd_q, [l0_job, 64]
+ ldr l0_e0_s, [l0_job, 80]
+ ldr l1_abcd_q, [l1_job, 64]
+ ldr l1_e0_s, [l1_job, 80]
+
+start_loop:
+
+ //load msgs
+ ld1 {l0_msg_0_v.4s-l0_msg_3_v.4s},[l0_data]
+ ld1 {l1_msg_0_v.4s-l1_msg_3_v.4s},[l1_data]
+
+ //adjust loop parameter
+ add l0_data,l0_data,64
+ add l1_data,l1_data,64
+ sub len, len, #1
+ cmp len, 0
+ //backup digest
+ mov l0_abcd_saved_v.16b, l0_abcd_v.16b
+ mov l0_e0_saved_v.16b, l0_e0_v.16b
+ mov l1_abcd_saved_v.16b, l1_abcd_v.16b
+ mov l1_e0_saved_v.16b, l1_e0_v.16b
+
+ rev32 l0_msg_0_v.16b, l0_msg_0_v.16b
+ rev32 l0_msg_1_v.16b, l0_msg_1_v.16b
+ add l0_tmp_0_v.4s, l0_msg_0_v.4s, key_0_v.4s
+ rev32 l0_msg_2_v.16b, l0_msg_2_v.16b
+ add l0_tmp_1_v.4s, l0_msg_1_v.4s, key_0_v.4s
+ rev32 l0_msg_3_v.16b, l0_msg_3_v.16b
+
+ rev32 l1_msg_0_v.16b, l1_msg_0_v.16b
+ rev32 l1_msg_1_v.16b, l1_msg_1_v.16b
+ add l1_tmp_0_v.4s, l1_msg_0_v.4s, key_0_v.4s
+ rev32 l1_msg_2_v.16b, l1_msg_2_v.16b
+ add l1_tmp_1_v.4s, l1_msg_1_v.4s, key_0_v.4s
+ rev32 l1_msg_3_v.16b, l1_msg_3_v.16b
+
+ /* rounds 0-3 */
+ sha1h l0_e1_s, l0_abcd_s
+ sha1c l0_abcd_q, l0_e0_s, l0_tmp_0_v.4s
+ add l0_tmp_0_v.4s, l0_msg_2_v.4s, key_0_v.4s
+ sha1su0 l0_msg_0_v.4s, l0_msg_1_v.4s, l0_msg_2_v.4s
+
+ sha1h l1_e1_s, l1_abcd_s
+ sha1c l1_abcd_q, l1_e0_s, l1_tmp_0_v.4s
+ add l1_tmp_0_v.4s, l1_msg_2_v.4s, key_0_v.4s
+ sha1su0 l1_msg_0_v.4s, l1_msg_1_v.4s, l1_msg_2_v.4s
+
+ sha1_4_rounds sha1c,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_0 /* rounds 4-7 */
+ sha1_4_rounds sha1c,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_0
+ sha1_4_rounds sha1c,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 12-15 */
+ sha1_4_rounds sha1c,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_1
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_1 /* rounds 20-23 */
+ sha1_4_rounds sha1p,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_1
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_1
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1p,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_2 /* rounds 36-39 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1m,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_2
+ sha1_4_rounds sha1m,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_2
+ sha1_4_rounds sha1m,msg_0,msg_1,msg_2,msg_3,abcd,e0,tmp_0,e1,tmp_1,key_3 /* rounds 52-55 */
+ sha1_4_rounds sha1m,msg_1,msg_2,msg_3,msg_0,abcd,e1,tmp_1,e0,tmp_0,key_3
+ sha1_4_rounds sha1p,msg_2,msg_3,msg_0,msg_1,abcd,e0,tmp_0,e1,tmp_1,key_3
+ sha1_4_rounds sha1p,msg_3,msg_0,msg_1,msg_2,abcd,e1,tmp_1,e0,tmp_0,key_3
+
+ /* rounds 68-71 */
+ sha1h l0_e0_s, l0_abcd_s
+ sha1p l0_abcd_q, l0_e1_s, l0_tmp_1_v.4s
+ add l0_tmp_1_v.4s, l0_msg_3_v.4s, key_3_v.4s
+ sha1su1 l0_msg_0_v.4s, l0_msg_3_v.4s
+
+ sha1h l1_e0_s, l1_abcd_s
+ sha1p l1_abcd_q, l1_e1_s, l1_tmp_1_v.4s
+ add l1_tmp_1_v.4s, l1_msg_3_v.4s, key_3_v.4s
+ sha1su1 l1_msg_0_v.4s, l1_msg_3_v.4s
+
+ /* rounds 72-75 */
+ sha1h l0_e1_s, l0_abcd_s
+ sha1p l0_abcd_q, l0_e0_s, l0_tmp_0_v.4s
+
+ sha1h l1_e1_s, l1_abcd_s
+ sha1p l1_abcd_q, l1_e0_s, l1_tmp_0_v.4s
+
+ /* rounds 76-79 */
+ sha1h l0_e0_s, l0_abcd_s
+ sha1p l0_abcd_q, l0_e1_s, l0_tmp_1_v.4s
+
+ sha1h l1_e0_s, l1_abcd_s
+ sha1p l1_abcd_q, l1_e1_s, l1_tmp_1_v.4s
+
+
+
+ add l0_abcd_v.4s, l0_abcd_v.4s, l0_abcd_saved_v.4s
+ add l0_e0_v.2s, l0_e0_v.2s, l0_e0_saved_v.2s
+ add l1_abcd_v.4s, l1_abcd_v.4s, l1_abcd_saved_v.4s
+ add l1_e0_v.2s, l1_e0_v.2s, l1_e0_saved_v.2s
+
+
+
+
+ bgt start_loop
+
+ str l0_abcd_q, [l0_job, 64]
+ str l0_e0_s, [l0_job, 80]
+
+
+ str l1_abcd_q, [l1_job, 64]
+ str l1_e0_s, [l1_job, 80]
+
+ //pop d8,d9 from stack
+ ldp d8, d9, [sp], 256
+ ret
+
+ .size sha1_mb_ce_x2, .-sha1_mb_ce_x2
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x5a827999
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x6ed9eba1
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0x8f1bbcdc
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
+ .word 0xca62c1d6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c
new file mode 100644
index 000000000..ad91d64ac
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c
@@ -0,0 +1,265 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx")
+#endif
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_avx(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_avx(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx_slver_02020142;
+struct slver sha1_ctx_mgr_init_avx_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_submit_avx_slver_02020143;
+struct slver sha1_ctx_mgr_submit_avx_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_flush_avx_slver_02020144;
+struct slver sha1_ctx_mgr_flush_avx_slver = { 0x0144, 0x02, 0x02 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c
new file mode 100644
index 000000000..85977d4c2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c
@@ -0,0 +1,264 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_avx2(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx2(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx2(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
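+
+/*
+ * Illustrative usage only (not part of the upstream file): a minimal sketch
+ * of how a caller might drive this context manager.  It assumes the
+ * hash_ctx_init() helper from multi_buffer.h and a suitably aligned manager
+ * object; error checking is omitted.
+ *
+ *	SHA1_HASH_CTX_MGR mgr;
+ *	SHA1_HASH_CTX ctx;
+ *
+ *	sha1_ctx_mgr_init_avx2(&mgr);
+ *	hash_ctx_init(&ctx);
+ *
+ *	// Hash a whole buffer in one call (FIRST and LAST combined).
+ *	sha1_ctx_mgr_submit_avx2(&mgr, &ctx, buf, buf_len, HASH_ENTIRE);
+ *
+ *	// Drain jobs still in flight before reading ctx.job.result_digest[].
+ *	while (sha1_ctx_mgr_flush_avx2(&mgr) != NULL)
+ *		;
+ */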
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
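+
+/*
+ * Worked example (illustrative, not part of the upstream file), assuming
+ * SHA1_BLOCK_SIZE == 64 and SHA1_PADLENGTHFIELD_SIZE == 8:
+ *   total_len = 10: 10 data bytes + 0x80 + 8 length bytes fit in one block,
+ *                   i ends up at 64 and hash_pad() returns 1 extra block.
+ *   total_len = 60: 60 + 1 + 8 > 64, so the length field spills into the
+ *                   next block, i ends up at 128 and hash_pad() returns 2.
+ */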
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx2_slver_04020145;
+struct slver sha1_ctx_mgr_init_avx2_slver = { 0x0145, 0x02, 0x04 };
+
+struct slver sha1_ctx_mgr_submit_avx2_slver_04020146;
+struct slver sha1_ctx_mgr_submit_avx2_slver = { 0x0146, 0x02, 0x04 };
+
+struct slver sha1_ctx_mgr_flush_avx2_slver_04020147;
+struct slver sha1_ctx_mgr_flush_avx2_slver = { 0x0147, 0x02, 0x04 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c
new file mode 100644
index 000000000..90e087163
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c
@@ -0,0 +1,271 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_avx512(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx512(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx512(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx512_slver_0600014a;
+struct slver sha1_ctx_mgr_init_avx512_slver = { 0x014a, 0x00, 0x06 };
+
+struct slver sha1_ctx_mgr_submit_avx512_slver_0600014b;
+struct slver sha1_ctx_mgr_submit_avx512_slver = { 0x014b, 0x00, 0x06 };
+
+struct slver sha1_ctx_mgr_flush_avx512_slver_0600014c;
+struct slver sha1_ctx_mgr_flush_avx512_slver = { 0x014c, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c
new file mode 100644
index 000000000..2013f829a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512_ni.c
@@ -0,0 +1,281 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+/**
+ * The sha1_ctx_avx512_ni functions target Cannon Lake (AVX512 plus SHA-NI).
+ * Since SHA-NI is still slower than the multi-buffer code when all lanes are
+ * full, sha1_ctx_mgr_init_avx512_ni and sha1_ctx_mgr_submit_avx512_ni are
+ * essentially the same as their avx512 counterparts.
+ * sha1_ctx_mgr_flush_avx512_ni is different: it calls
+ * sha1_mb_mgr_flush_avx512_ni, which switches to SHA-NI when the number of
+ * active lanes drops to or below a threshold.
+ *
+ */
+#if defined(HAVE_AS_KNOWS_AVX512) && defined(HAVE_AS_KNOWS_SHANI)
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_avx512_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx512_ni(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx512_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx512_ni(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx512_ni_slver_080002c4;
+struct slver sha1_ctx_mgr_init_avx512_ni_slver = { 0x02c4, 0x00, 0x08 };
+
+struct slver sha1_ctx_mgr_submit_avx512_ni_slver_080002c5;
+struct slver sha1_ctx_mgr_submit_avx512_ni_slver = { 0x02c5, 0x00, 0x08 };
+
+struct slver sha1_ctx_mgr_flush_avx512_ni_slver_080002c6;
+struct slver sha1_ctx_mgr_flush_avx512_ni_slver = { 0x02c6, 0x00, 0x08 };
+
+#endif // HAVE_AS_KNOWS_AVX512 and HAVE_AS_KNOWS_SHANI
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c
new file mode 100644
index 000000000..90481efd0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base.c
@@ -0,0 +1,325 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (b ^ c ^ d)
+#define F3(b,c,d) ((b & c) | (d & (b | c)))
+#define F4(b,c,d) (b ^ c ^ d)
+
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+#define W(x) w[(x) & 15]
+
+#define step00_19(i,a,b,c,d,e) \
+ if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ else W(i) = to_be32(ww[i]); \
+ e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \
+ b = rol32(b,30)
+
+#define step20_39(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \
+ b = rol32(b,30)
+
+#define step40_59(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \
+ b = rol32(b,30)
+
+#define step60_79(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \
+ b = rol32(b,30)
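+
+// The schedule above keeps only a 16-entry sliding window: W(x) masks its
+// index with 15, and for rounds 16-79 each new word is
+// rol32(W(i-3) ^ W(i-8) ^ W(i-14) ^ W(i-16), 1), as in FIPS 180-4.
+// F1 and F3 are the SHA-1 Ch and Maj functions; F2 and F4 are plain parity.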
+
+static void sha1_init(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static uint32_t sha1_update(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static void sha1_final(SHA1_HASH_CTX * ctx, uint32_t remain_len);
+static void OPT_FIX sha1_single(const void *data, uint32_t digest[]);
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+
+void sha1_ctx_mgr_init_base(SHA1_HASH_CTX_MGR * mgr)
+{
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_base(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ uint32_t remain_len;
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) {
+ // Cannot submit a new entire job to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags == HASH_FIRST) {
+
+ sha1_init(ctx, buffer, len);
+ sha1_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_UPDATE) {
+ sha1_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_LAST) {
+ remain_len = sha1_update(ctx, buffer, len);
+ sha1_final(ctx, remain_len);
+ }
+
+ if (flags == HASH_ENTIRE) {
+ sha1_init(ctx, buffer, len);
+ remain_len = sha1_update(ctx, buffer, len);
+ sha1_final(ctx, remain_len);
+ }
+
+ return ctx;
+}
+
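+// The base implementation hashes synchronously inside sha1_ctx_mgr_submit_base(),
+// so there is never an outstanding job left for flush to return.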
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_base(SHA1_HASH_CTX_MGR * mgr)
+{
+ return NULL;
+}
+
+static void sha1_init(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Mark it as processing
+ ctx->status = HASH_CTX_STS_PROCESSING;
+}
+
+static uint32_t sha1_update(SHA1_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ uint32_t remain_len = len;
+ uint32_t *digest = ctx->job.result_digest;
+
+ while (remain_len >= SHA1_BLOCK_SIZE) {
+ sha1_single(buffer, digest);
+ buffer = (void *)((uint8_t *) buffer + SHA1_BLOCK_SIZE);
+ remain_len -= SHA1_BLOCK_SIZE;
+ ctx->total_length += SHA1_BLOCK_SIZE;
+ }
+
+ ctx->status = HASH_CTX_STS_IDLE;
+ ctx->incoming_buffer = buffer;
+ return remain_len;
+}
+
+static void sha1_final(SHA1_HASH_CTX * ctx, uint32_t remain_len)
+{
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t i = remain_len, j;
+ uint8_t buf[2 * SHA1_BLOCK_SIZE];
+ uint32_t *digest = ctx->job.result_digest;
+
+ ctx->total_length += i;
+ memcpy(buf, buffer, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - SHA1_PADLENGTHFIELD_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA1_BLOCK_SIZE - SHA1_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA1_BLOCK_SIZE;
+ else
+ i = SHA1_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8);
+
+ sha1_single(buf, digest);
+ if (i == 2 * SHA1_BLOCK_SIZE) {
+ sha1_single(buf + SHA1_BLOCK_SIZE, digest);
+ }
+
+ ctx->status = HASH_CTX_STS_COMPLETE;
+}
+
+void sha1_single(const void *data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e;
+ uint32_t w[16] = { 0 };
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+
+ step00_19(0, a, b, c, d, e);
+ step00_19(1, e, a, b, c, d);
+ step00_19(2, d, e, a, b, c);
+ step00_19(3, c, d, e, a, b);
+ step00_19(4, b, c, d, e, a);
+ step00_19(5, a, b, c, d, e);
+ step00_19(6, e, a, b, c, d);
+ step00_19(7, d, e, a, b, c);
+ step00_19(8, c, d, e, a, b);
+ step00_19(9, b, c, d, e, a);
+ step00_19(10, a, b, c, d, e);
+ step00_19(11, e, a, b, c, d);
+ step00_19(12, d, e, a, b, c);
+ step00_19(13, c, d, e, a, b);
+ step00_19(14, b, c, d, e, a);
+ step00_19(15, a, b, c, d, e);
+ step00_19(16, e, a, b, c, d);
+ step00_19(17, d, e, a, b, c);
+ step00_19(18, c, d, e, a, b);
+ step00_19(19, b, c, d, e, a);
+
+ step20_39(20, a, b, c, d, e);
+ step20_39(21, e, a, b, c, d);
+ step20_39(22, d, e, a, b, c);
+ step20_39(23, c, d, e, a, b);
+ step20_39(24, b, c, d, e, a);
+ step20_39(25, a, b, c, d, e);
+ step20_39(26, e, a, b, c, d);
+ step20_39(27, d, e, a, b, c);
+ step20_39(28, c, d, e, a, b);
+ step20_39(29, b, c, d, e, a);
+ step20_39(30, a, b, c, d, e);
+ step20_39(31, e, a, b, c, d);
+ step20_39(32, d, e, a, b, c);
+ step20_39(33, c, d, e, a, b);
+ step20_39(34, b, c, d, e, a);
+ step20_39(35, a, b, c, d, e);
+ step20_39(36, e, a, b, c, d);
+ step20_39(37, d, e, a, b, c);
+ step20_39(38, c, d, e, a, b);
+ step20_39(39, b, c, d, e, a);
+
+ step40_59(40, a, b, c, d, e);
+ step40_59(41, e, a, b, c, d);
+ step40_59(42, d, e, a, b, c);
+ step40_59(43, c, d, e, a, b);
+ step40_59(44, b, c, d, e, a);
+ step40_59(45, a, b, c, d, e);
+ step40_59(46, e, a, b, c, d);
+ step40_59(47, d, e, a, b, c);
+ step40_59(48, c, d, e, a, b);
+ step40_59(49, b, c, d, e, a);
+ step40_59(50, a, b, c, d, e);
+ step40_59(51, e, a, b, c, d);
+ step40_59(52, d, e, a, b, c);
+ step40_59(53, c, d, e, a, b);
+ step40_59(54, b, c, d, e, a);
+ step40_59(55, a, b, c, d, e);
+ step40_59(56, e, a, b, c, d);
+ step40_59(57, d, e, a, b, c);
+ step40_59(58, c, d, e, a, b);
+ step40_59(59, b, c, d, e, a);
+
+ step60_79(60, a, b, c, d, e);
+ step60_79(61, e, a, b, c, d);
+ step60_79(62, d, e, a, b, c);
+ step60_79(63, c, d, e, a, b);
+ step60_79(64, b, c, d, e, a);
+ step60_79(65, a, b, c, d, e);
+ step60_79(66, e, a, b, c, d);
+ step60_79(67, d, e, a, b, c);
+ step60_79(68, c, d, e, a, b);
+ step60_79(69, b, c, d, e, a);
+ step60_79(70, a, b, c, d, e);
+ step60_79(71, e, a, b, c, d);
+ step60_79(72, d, e, a, b, c);
+ step60_79(73, c, d, e, a, b);
+ step60_79(74, b, c, d, e, a);
+ step60_79(75, a, b, c, d, e);
+ step60_79(76, e, a, b, c, d);
+ step60_79(77, d, e, a, b, c);
+ step60_79(78, c, d, e, a, b);
+ step60_79(79, b, c, d, e, a);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+struct slver sha1_ctx_mgr_init_base_slver_00000192;
+struct slver sha1_ctx_mgr_init_base_slver = { 0x0192, 0x00, 0x00 };
+
+struct slver sha1_ctx_mgr_submit_base_slver_00000193;
+struct slver sha1_ctx_mgr_submit_base_slver = { 0x0193, 0x00, 0x00 };
+
+struct slver sha1_ctx_mgr_flush_base_slver_00000194;
+struct slver sha1_ctx_mgr_flush_base_slver = { 0x0194, 0x00, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c
new file mode 100644
index 000000000..32eb07f6e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_base_aliases.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+
+extern void sha1_ctx_mgr_init_base(SHA1_HASH_CTX_MGR * mgr);
+extern SHA1_HASH_CTX *sha1_ctx_mgr_submit_base(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags);
+extern SHA1_HASH_CTX *sha1_ctx_mgr_flush_base(SHA1_HASH_CTX_MGR * mgr);
+
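+// These wrappers expose the generic sha1_ctx_mgr_* entry points by forwarding
+// to the portable base implementation declared above.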
+void sha1_ctx_mgr_init(SHA1_HASH_CTX_MGR * mgr)
+{
+ return sha1_ctx_mgr_init_base(mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ return sha1_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush(SHA1_HASH_CTX_MGR * mgr)
+{
+ return sha1_ctx_mgr_flush_base(mgr);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c
new file mode 100644
index 000000000..db70ee015
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c
@@ -0,0 +1,251 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_sse(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_sse(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_sse(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_sse_slver_00020139;
+struct slver sha1_ctx_mgr_init_sse_slver = { 0x0139, 0x02, 0x00 };
+
+struct slver sha1_ctx_mgr_submit_sse_slver_00020140;
+struct slver sha1_ctx_mgr_submit_sse_slver = { 0x0140, 0x02, 0x00 };
+
+struct slver sha1_ctx_mgr_flush_sse_slver_00020141;
+struct slver sha1_ctx_mgr_flush_sse_slver = { 0x0141, 0x02, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c
new file mode 100644
index 000000000..d3c7687d2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse_ni.c
@@ -0,0 +1,259 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_SHANI
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_sse_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+	// Same as the SSE version
+ sha1_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_sse_ni(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse_ni(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_sse_ni(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_sse_ni(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse_ni(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse_ni(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA1_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_sse_ni_slver_070002c1;
+struct slver sha1_ctx_mgr_init_sse_ni_slver = { 0x02c1, 0x00, 0x07 };
+
+struct slver sha1_ctx_mgr_submit_sse_ni_slver_070002c2;
+struct slver sha1_ctx_mgr_submit_sse_ni_slver = { 0x02c2, 0x00, 0x07 };
+
+struct slver sha1_ctx_mgr_flush_sse_ni_slver_070002c3;
+struct slver sha1_ctx_mgr_flush_sse_ni_slver = { 0x02c3, 0x00, 0x07 };
+
+#endif // HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm
new file mode 100644
index 000000000..1c9a66fd4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm
@@ -0,0 +1,67 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Threshold constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; if the number of lanes in use is <= threshold, use the single-buffer (sb) function
+%define SHA1_SB_THRESHOLD_SSE 1
+%define SHA1_SB_THRESHOLD_AVX 1
+%define SHA1_SB_THRESHOLD_AVX2 1
+%define SHA1_SB_THRESHOLD_AVX512 1
+%define SHA1_NI_SB_THRESHOLD_SSE	4	; SHA-NI single-buffer is faster than the SSE sha1_mb path at low lane counts
+%define SHA1_NI_SB_THRESHOLD_AVX512 6
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA1_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA1_JOB
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD _len, 4, 4 ; length in bytes
+FIELD _result_digest, 5*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+END_FIELDS
+
+%assign _SHA1_JOB_size _FIELD_OFFSET
+%assign _SHA1_JOB_align _STRUCT_ALIGN
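The FIELD macros above describe the assembly-side view of a SHA1_JOB, and the SHA1_*_SB_THRESHOLD values steer the flush routines toward a single-buffer kernel when only a few lanes are occupied. Below is a hedged C mirror of the implied job layout; the authoritative typedef lives in sha1_mb.h, and the struct name and the GCC/Clang alignment attribute here are illustrative assumptions.

#include <stdint.h>

/* Layout implied by the FIELD lines above (offsets: buffer 0, len 8,
 * result_digest 64 because of its 64-byte alignment, status 84, user_data 88).
 * Sketch only; the real definition is the SHA1_JOB typedef in sha1_mb.h. */
typedef struct {
	uint8_t  *buffer;                    /* _buffer: input data pointer                     */
	uint32_t  len;                       /* _len: length handed to the MB core              */
	uint32_t  result_digest[5] __attribute__((aligned(64)));  /* _result_digest             */
	uint32_t  status;                    /* _status: STS_UNKNOWN/BEING_PROCESSED/COMPLETED  */
	void     *user_data;                 /* _user_data: opaque caller data                  */
} sha1_job_sketch;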
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c
new file mode 100644
index 000000000..4bf2e09b5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_flush_test.c
@@ -0,0 +1,146 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha1_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS (SHA1_MAX_LANES - 1)
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+uint8_t lens_print_and_check(SHA1_HASH_CTX_MGR * mgr)
+{
+ static int32_t last_lens[SHA1_MAX_LANES] = { 0 };
+ int32_t len;
+ uint8_t num_unchanged = 0;
+ int i;
+ for (i = 0; i < SHA1_MAX_LANES; i++) {
+ len = (int32_t) mgr->mgr.lens[i];
+		// lens[i] in mgr encodes (block_count << 4) | lane_index; convert blocks to bytes below
+ len = (len >= 16) ? (len >> 4 << 6) : 0;
+ printf("\t%d", len);
+ if (last_lens[i] > 0 && last_lens[i] == len)
+ num_unchanged += 1;
+ last_lens[i] = len;
+ }
+ printf("\n");
+ return num_unchanged;
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ uint8_t num_ret, num_unchanged = 0;
+ int ret;
+
+ printf("sha1_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ lens[i] = TEST_LEN / SHA1_MAX_LANES * (i + 1);
+ bufs[i] = (unsigned char *)malloc(lens[i]);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], lens[i]);
+ }
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Init ctx contexts
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha1_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ printf("Changes of lens inside mgr:\n");
+ lens_print_and_check(mgr);
+ while (sha1_ctx_mgr_flush(mgr)) {
+ num_ret = lens_print_and_check(mgr);
+ num_unchanged = num_unchanged > num_ret ? num_unchanged : num_ret;
+ }
+ printf("Info of sha1_mb lens prints over\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else if (num_unchanged)
+ printf("SHA-NI is used when %d or %d jobs are uncompleted\n",
+ num_unchanged, num_unchanged + 1);
+ else
+ printf("SHA-NI is not used, or used for last job\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..21c81403b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm
@@ -0,0 +1,74 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA1 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA1_ARGS_X16
+;;; name size align
+FIELD _digest, 4*5*16, 16 ; transposed digest
+FIELD _data_ptr, 8*16, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _SHA1_ARGS_X4_size _FIELD_OFFSET
+%assign _SHA1_ARGS_X4_align _STRUCT_ALIGN
+%assign _SHA1_ARGS_X8_size _FIELD_OFFSET
+%assign _SHA1_ARGS_X8_align _STRUCT_ALIGN
+%assign _SHA1_ARGS_X16_size _FIELD_OFFSET
+%assign _SHA1_ARGS_X16_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _SHA1_ARGS_X4_size, _SHA1_ARGS_X4_align
+FIELD _lens, 4*16, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*16, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
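The digest area is stored transposed across lanes: word w of lane l sits at dword index w*NLANES + l, which is why the flush routines below read _args_digest + 4*idx + w*16 (or w*32 and w*64 for the wider managers). Here is a hedged scalar sketch of that indexing; NLANES, the helper names, and the 4-lane choice are illustrative assumptions.

#include <stdint.h>

#define NLANES 4   /* 4-lane (SSE/AVX) manager; 8 for AVX2, 16 for AVX512 */

/* digest word w of lane l lives at offset (w*NLANES + l)*4 bytes */
static uint32_t get_lane_digest_word(const uint32_t *args_digest,
                                     unsigned lane, unsigned word)
{
	return args_digest[word * NLANES + lane];
}

/* the same gather the asm performs with movd/pinsrd plus one scalar load */
static void copy_out_digest(const uint32_t *args_digest, unsigned lane,
                            uint32_t out[5])
{
	for (unsigned w = 0; w < 5; w++)
		out[w] = get_lane_digest_word(args_digest, lane, w);
}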
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..c5fd71300
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm
@@ -0,0 +1,247 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_avx
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be in a register not clobbered by sha1_mult
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be in a register not clobbered by sha1_mult
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*2
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_avx, function
+sha1_mb_mgr_flush_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+	; compare with the SHA single-buffer threshold; if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX
+ ja mb_processing
+
+ ; lensN-len2=idx
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; avx has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
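Because each lens[] entry encodes (remaining_blocks << 4) | lane_index, the cmp/cmovb chain above finds both the shortest remaining length and the lane that owns it in a single pass. A hedged scalar sketch of that selection follows; the helper name is illustrative.

#include <stdint.h>

/* Scalar equivalent of the min-length selection in sha1_mb_mgr_flush_avx. */
static void find_min_lane(const uint32_t lens[4],
                          uint32_t *lane, uint32_t *blocks)
{
	uint32_t min = lens[0];
	for (int i = 1; i < 4; i++)
		if (lens[i] < min)      /* the asm uses cmp/cmovb for the same test */
			min = lens[i];
	*lane = min & 0xF;              /* "and idx, 0xF"                        */
	*blocks = min >> 4;             /* "shr len2, 4" after masking the nibble */
}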
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..a47ae2838
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm
@@ -0,0 +1,273 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x8_avx2
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha1_mb_x8_avx2 and sha1_opt_x1
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx2(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_avx2, function
+sha1_mb_mgr_flush_avx2:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; compare with the SHA single-buffer threshold; if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX2
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x2000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x8_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*32]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
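The AVX2 manager replaces the scalar cmov chain with a SIMD horizontal minimum over all eight lens entries (one vpminud across the two halves, then vpalignr/vpminud folds). Below is a hedged scalar equivalent showing what ends up in the low dword of xmm2; illustrative code only.

#include <stdint.h>

/* Scalar equivalent of the vpminud/vpalignr reduction: pairwise minima are
 * folded until m[0] holds the minimum of all eight (blocks << 4) | lane
 * entries, i.e. the value vmovd extracts into idx. */
static uint32_t horizontal_min8(const uint32_t lens[8])
{
	uint32_t m[8];
	for (int i = 0; i < 8; i++)
		m[i] = lens[i];
	/* step 4: vpminud xmm2,xmm0,xmm1; step 2: vpalignr 8 + vpminud;
	 * step 1: vpalignr 4 + vpminud */
	for (int step = 4; step >= 1; step >>= 1)
		for (int i = 0; i < step; i++)
			m[i] = m[i] < m[i + step] ? m[i] : m[i + step];
	return m[0];
}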
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..5e3db5b9b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm
@@ -0,0 +1,271 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha1_mb_x16_avx512
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%else
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions and the registers' state afterwards (unused, covered, unchanged)
+%define state arg1 ; unchanged
+%define job arg2 ; unused
+%define len2 arg2 ; unused
+
+; idx must be a register not clobbered by sha1_mb_x16_avx512
+%define idx rbp ; unchanged
+
+%define unused_lanes rbx ; covered
+%define lane_data rbx ; covered
+%define tmp2 rbx ; covered
+
+%define num_lanes_inuse r9 ; covered
+
+%define job_rax rax ; covered
+%define tmp rax ; unused
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx512(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_avx512, function
+sha1_mb_mgr_flush_avx512:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; compare with the SHA single-buffer threshold; if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_AVX512
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+	or	r10, 0x4000	; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*64]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_mb_mgr_flush_avx512
+no_sha1_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
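Every flush variant retires the finished job the same way: it clears _job_in_lane, marks the job STS_COMPLETED, pushes the freed lane index onto unused_lanes (a stack of 4-bit nibbles kept in one 64-bit word) and decrements num_lanes_inuse. Here is a hedged sketch of that nibble-stack bookkeeping; the pop side belongs to the submit routines, which are not part of this hunk, so it is shown here only as an assumption.

#include <stdint.h>

/* "shl unused_lanes, 4; or unused_lanes, idx" from the flush code above */
static void push_free_lane(uint64_t *unused_lanes, uint32_t lane)
{
	*unused_lanes = (*unused_lanes << 4) | (lane & 0xF);
}

/* assumed submit-side counterpart: take the lane in the low nibble */
static uint32_t pop_free_lane(uint64_t *unused_lanes)
{
	uint32_t lane = (uint32_t)(*unused_lanes & 0xF);
	*unused_lanes >>= 4;
	return lane;
}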
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm
new file mode 100644
index 000000000..4170b6c73
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512_ni.asm
@@ -0,0 +1,278 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ %ifdef HAVE_AS_KNOWS_SHANI
+
+extern sha1_mb_x16_avx512
+extern sha1_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%else
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions and the registers' state afterwards (unused, covered, unchanged)
+%define state arg1 ; unchanged
+%define job arg2 ; unused
+%define len2 arg2 ; unused
+
+; idx must be a register not clobbered by sha1_mb_x16_avx512
+%define idx rbp ; unchanged
+
+%define unused_lanes rbx ; covered
+%define lane_data rbx ; covered
+%define tmp2 rbx ; covered
+
+%define num_lanes_inuse r9 ; covered
+
+%define job_rax rax ; covered
+%define tmp rax ; unused
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx512_ni(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_avx512_ni, function
+sha1_mb_mgr_flush_avx512_ni:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; compare with the SHA-NI single-buffer threshold; if num_lanes_inuse <= threshold, use the SHA-NI single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA1_NI_SB_THRESHOLD_AVX512
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+	or	r10, 0x4000	; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_ni_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*64]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+ %else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_flush_avx512_ni
+ no_sha1_mb_mgr_flush_avx512_ni:
+ %endif
+ %endif ; HAVE_AS_KNOWS_SHANI
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_flush_avx512_ni
+ no_sha1_mb_mgr_flush_avx512_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..2a4c4b50a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm
@@ -0,0 +1,249 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_sse
+extern sha1_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*2
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_sse(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_sse, function
+sha1_mb_mgr_flush_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+	; compare with the SHA single-buffer threshold; if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA1_SB_THRESHOLD_SSE
+ ja mb_processing
+
+ ; lensN-len2=idx
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16]
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm
new file mode 100644
index 000000000..ea3cffd33
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse_ni.asm
@@ -0,0 +1,256 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha1_mb_x4_sse
+extern sha1_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than ARG1, ARG2, rax, r8-r11
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*2
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_sse_ni(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha1_mb_mgr_flush_sse_ni, function
+sha1_mb_mgr_flush_sse_ni:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+	; copy lane idx's data pointer to the empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+	; compare with the SHA-NI single-buffer threshold; if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA1_NI_SB_THRESHOLD_SSE
+ ja mb_processing
+
+	; lens[idx] - len2 == idx, so storing idx leaves the lane with zero whole blocks
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+	or	r10, 0x1000	; r10 byte 0 = lane idx, byte 1 = 16 (4 SSE lanes * 4 bytes, the digest lane stride)
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_ni_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16]
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
+%else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_flush_sse_ni
+ no_sha1_mb_mgr_flush_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
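
The flush routine above does three things: it points every empty lane at a live lane's buffer (forcing that lane's length to 0xFFFFFFFF so it can never win the minimum search), finds the smallest packed length across the four lanes, and dispatches either to the SHA-NI single-buffer kernel or to the 4-lane SSE kernel depending on how many lanes are still occupied. A rough C sketch of that control flow, for orientation only: the helpers find_live_lane(), run_kernel() and complete_job() are hypothetical, and the args.data_ptr spelling is assumed from the assembly symbols rather than taken from the C headers.

    /* Illustrative sketch of the flush control flow; not the shipped code. */
    SHA1_JOB *flush_sketch(SHA1_MB_JOB_MGR *state)
    {
        if (state->num_lanes_inuse == 0)
            return NULL;

        int live = find_live_lane(state);            /* hypothetical: first lane holding a job */
        for (int i = 0; i < 4; i++) {
            if (state->ldata[i].job_in_lane == NULL) {
                state->args.data_ptr[i] = state->args.data_ptr[live]; /* borrow a valid buffer */
                state->lens[i] = 0xFFFFFFFF;         /* can never be the minimum */
            }
        }

        uint32_t min = state->lens[0];               /* entries are packed (blocks << 4) | lane */
        for (int i = 1; i < 4; i++)
            if (state->lens[i] < min)
                min = state->lens[i];

        uint32_t idx = min & 0xF, blocks = min >> 4;
        if (blocks)                                  /* choose SHA-NI x1 vs the 4-lane SSE kernel */
            run_kernel(state, idx, blocks,
                       state->num_lanes_inuse <= SHA1_NI_SB_THRESHOLD_SSE);

        return complete_job(state, idx);             /* hypothetical: mark done, free the lane */
    }
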
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..b6124486a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+
+void sha1_mb_mgr_init_avx2(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF76543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA1_X8_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
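
The odd-looking constant 0xF76543210 is a free-lane stack packed into nibbles: lanes 7 down to 0 as 4-bit entries, with 0xF on top as a sentinel. Submit pops the low nibble to claim a lane; completion shifts the word left and ORs the lane back in. The SSE variant below uses 0xF3210 for its 4 lanes, while the AVX-512 variant fills all 16 nibbles (0xfedcba9876543210) and therefore relies on the separate num_lanes_inuse counter, compared against 16, to detect a full manager. A minimal sketch of the two operations, with hypothetical helper names:

    /* Sketch of the packed-nibble free-lane stack (illustrative only). */
    static inline int pop_lane(uint64_t *unused_lanes)
    {
        int lane = (int)(*unused_lanes & 0xF);  /* lowest nibble = next free lane */
        *unused_lanes >>= 4;
        return lane;
    }

    static inline void push_lane(uint64_t *unused_lanes, int lane)
    {
        *unused_lanes = (*unused_lanes << 4) | (uint64_t)lane;  /* return lane to the stack */
    }

After eight pops the AVX2 word collapses to the sentinel 0xF, which is exactly the value the submit routine compares against to decide that every lane is busy.
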
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..033fb3c9f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+
+void sha1_mb_mgr_init_avx512(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xfedcba9876543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA1_MAX_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c
new file mode 100644
index 000000000..811c4a9dd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+
+void sha1_mb_mgr_init_sse(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF3210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA1_MIN_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..49c018138
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm
@@ -0,0 +1,246 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_avx
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be in a register not clobbered by sha1_mb_x4_avx
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%else
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+; idx needs to be in a register not clobbered by sha1_mb_x4_avx
+%define last_len rdx
+%define idx rdx
+
+%define size_offset rcx
+%define tmp2 rcx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*4 + 16*10 + 8
+
+; SHA1_JOB* sha1_mb_mgr_submit_avx(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha1_mb_mgr_submit_avx, function
+sha1_mb_mgr_submit_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*4 + 16*0], xmm6
+ vmovdqa [rsp + 8*4 + 16*1], xmm7
+ vmovdqa [rsp + 8*4 + 16*2], xmm8
+ vmovdqa [rsp + 8*4 + 16*3], xmm9
+ vmovdqa [rsp + 8*4 + 16*4], xmm10
+ vmovdqa [rsp + 8*4 + 16*5], xmm11
+ vmovdqa [rsp + 8*4 + 16*6], xmm12
+ vmovdqa [rsp + 8*4 + 16*7], xmm13
+ vmovdqa [rsp + 8*4 + 16*8], xmm14
+ vmovdqa [rsp + 8*4 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*16], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*16], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*16]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*4 + 16*0]
+ vmovdqa xmm7, [rsp + 8*4 + 16*1]
+ vmovdqa xmm8, [rsp + 8*4 + 16*2]
+ vmovdqa xmm9, [rsp + 8*4 + 16*3]
+ vmovdqa xmm10, [rsp + 8*4 + 16*4]
+ vmovdqa xmm11, [rsp + 8*4 + 16*5]
+ vmovdqa xmm12, [rsp + 8*4 + 16*6]
+ vmovdqa xmm13, [rsp + 8*4 + 16*7]
+ vmovdqa xmm14, [rsp + 8*4 + 16*8]
+ vmovdqa xmm15, [rsp + 8*4 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+H0: dd 0x67452301
+H1: dd 0xefcdab89
+H2: dd 0x98badcfe
+H3: dd 0x10325476
+H4: dd 0xc3d2e1f0
+
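
The submit path above claims a free lane from the nibble stack, packs the job length as (blocks << 4) | lane into the lens slot, copies the digest and buffer pointer into the lane's columns, and only runs the kernel once every lane holds a job; the job in the lane with the smallest packed length finishes first and is returned. A condensed C sketch of that gating, with hypothetical helpers and the field names implied by the assembly offsets:

    /* Illustrative sketch of the submit gating; not the shipped code. */
    SHA1_JOB *submit_sketch(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
    {
        int lane = pop_lane(&state->unused_lanes);   /* see the pop_lane() sketch earlier */
        job->status = STS_BEING_PROCESSED;
        state->ldata[lane].job_in_lane = job;
        state->lens[lane] = (job->len << 4) | lane;  /* pack block count and lane index */
        copy_digest_and_buffer(state, job, lane);    /* hypothetical helper */
        state->num_lanes_inuse++;

        if (state->unused_lanes != 0xF)              /* a lane is still free: just queue */
            return NULL;

        /* all 4 lanes busy: advance every lane by the common block count,
         * then finish and return the job in the lane that ran out first */
        return run_min_blocks_and_complete(state);   /* hypothetical helper */
    }
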
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..95b4f1715
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm
@@ -0,0 +1,250 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "memcpy.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x8_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%define extra_blocks rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%define extra_blocks rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+; idx must be a register not clobbered by sha1_mb_x8_avx2
+%define idx r8
+%define last_len r8
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* sha1_mb_mgr_submit_avx2(MB_MGR *state, JOB_SHA1 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha1_mb_mgr_submit_avx2, function
+sha1_mb_mgr_submit_avx2:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+
+ shl len,4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+
+ vmovd [state + _args_digest + 4*lane + 0*32], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*32], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*32], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*32], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*32], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xf
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x8_avx2
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*32]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
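
The AVX2 manager replaces the scalar minimum search with a branch-free SIMD reduction: the eight packed lengths are min-folded down to a single dword, the lane nibble is masked off, and the remaining common block count is broadcast and subtracted from every lane. A rough intrinsics equivalent (assuming SSE4.1 for _mm_min_epu32; the shipped code is the assembly above):

    #include <immintrin.h>
    #include <stdint.h>

    /* Illustrative: horizontal min over 8 packed (blocks<<4 | lane) values,
     * then subtract the shared block count from every lane while preserving
     * the lane nibbles (this mirrors the clear_low_nibble mask). */
    static uint32_t min_and_subtract8(uint32_t lens[8])
    {
        __m128i a = _mm_loadu_si128((const __m128i *)&lens[0]);
        __m128i b = _mm_loadu_si128((const __m128i *)&lens[4]);
        __m128i m = _mm_min_epu32(a, b);                /* pairwise min of the halves */
        m = _mm_min_epu32(m, _mm_srli_si128(m, 8));     /* fold the upper qword down  */
        m = _mm_min_epu32(m, _mm_srli_si128(m, 4));     /* fold the last dword        */
        uint32_t min = (uint32_t)_mm_cvtsi128_si32(m);

        __m128i common = _mm_set1_epi32((int)(min & ~0xFu));  /* drop the lane nibble */
        _mm_storeu_si128((__m128i *)&lens[0], _mm_sub_epi32(a, common));
        _mm_storeu_si128((__m128i *)&lens[4], _mm_sub_epi32(b, common));
        return min;        /* caller: lane = min & 0xF, blocks = min >> 4 */
    }

The assembly only performs the subtraction when the block count is non-zero; the sketch leaves that check to the caller.
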
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..a4f9389a1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm
@@ -0,0 +1,248 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "memcpy.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha1_mb_x16_avx512
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%else
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions, annotated with their state after the kernel call (unused, covered, unchanged)
+%define state arg1 ; unchanged, mb_x16's input1
+%define job arg2 ; arg2 unused
+%define len2 arg2 ; arg2 unused, mb_x16's input2
+
+; idx must be a register not clobbered by sha1_mb_x16_avx512
+%define idx r8 ; unchanged
+
+%define p r11 ; unused
+
+%define unused_lanes rbx ; covered
+
+%define job_rax rax ; covered
+%define len rax ; unused
+
+%define lane rbp ; unused
+
+%define tmp r9 ; covered
+%define num_lanes_inuse r9 ; covered
+
+%define lane_data r10 ; covered
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* sha1_mb_mgr_submit_avx512(MB_MGR *state, JOB_SHA1 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha1_mb_mgr_submit_avx512, function
+sha1_mb_mgr_submit_avx512:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+
+ shl len,4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+
+ vmovd [state + _args_digest + 4*lane + 0*64], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*64], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*64], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*64], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*64], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 16
+ jne return_null
+
+start_loop:
+	; Find min length; ymm0 holds the first 8 lane lengths, ymm1 the last 8
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+	and	idx, 0xF	; idx is the index of the min-length lane
+ shr len2, 4 ; size in blocks
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x16_avx512
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*64]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=32
+
+align 32
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_mb_mgr_submit_avx512
+no_sha1_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
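
The 4*lane + i*64 addressing above implies a transposed (word-major) digest layout: word i of every lane is stored contiguously, so one lane's SHA-1 state is a strided column, which is what lets the x16 kernel load word i of all 16 lanes with a single vector load. The row stride is 4 bytes times the lane count, hence the 16/32/64 factors seen in the SSE, AVX2 and AVX-512 managers. A small sketch of reading and writing one lane's column (NUM_LANES would be 16 here):

    /* Illustrative scatter/gather of one lane's digest column. */
    #define NUM_LANES 16

    static void scatter_digest(uint32_t args_digest[5][NUM_LANES],
                               const uint32_t digest[5], int lane)
    {
        for (int i = 0; i < 5; i++)
            args_digest[i][lane] = digest[i];   /* column write, row stride 4*NUM_LANES bytes */
    }

    static void gather_digest(uint32_t digest[5],
                              const uint32_t args_digest[5][NUM_LANES], int lane)
    {
        for (int i = 0; i < 5; i++)
            digest[i] = args_digest[i][lane];
    }
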
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..9989a9a1d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm
@@ -0,0 +1,246 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_sse
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%else
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len rdx
+%define idx rdx
+
+%define size_offset rcx
+%define tmp2 rcx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*4 + 16*10 + 8
+
+; SHA1_JOB* sha1_mb_mgr_submit_sse(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha1_mb_mgr_submit_sse, function
+sha1_mb_mgr_submit_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ movdqa [rsp + 8*4 + 16*0], xmm6
+ movdqa [rsp + 8*4 + 16*1], xmm7
+ movdqa [rsp + 8*4 + 16*2], xmm8
+ movdqa [rsp + 8*4 + 16*3], xmm9
+ movdqa [rsp + 8*4 + 16*4], xmm10
+ movdqa [rsp + 8*4 + 16*5], xmm11
+ movdqa [rsp + 8*4 + 16*6], xmm12
+ movdqa [rsp + 8*4 + 16*7], xmm13
+ movdqa [rsp + 8*4 + 16*8], xmm14
+ movdqa [rsp + 8*4 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+ movd [state + _args_digest + 4*lane + 0*16], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*16], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*16]
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 8*4 + 16*0]
+ movdqa xmm7, [rsp + 8*4 + 16*1]
+ movdqa xmm8, [rsp + 8*4 + 16*2]
+ movdqa xmm9, [rsp + 8*4 + 16*3]
+ movdqa xmm10, [rsp + 8*4 + 16*4]
+ movdqa xmm11, [rsp + 8*4 + 16*5]
+ movdqa xmm12, [rsp + 8*4 + 16*6]
+ movdqa xmm13, [rsp + 8*4 + 16*7]
+ movdqa xmm14, [rsp + 8*4 + 16*8]
+ movdqa xmm15, [rsp + 8*4 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+H0: dd 0x67452301
+H1: dd 0xefcdab89
+H2: dd 0x98badcfe
+H3: dd 0x10325476
+H4: dd 0xc3d2e1f0
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm
new file mode 100644
index 000000000..979324de4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse_ni.asm
@@ -0,0 +1,290 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha1_mb_x4_sse
+extern sha1_ni_x2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%else
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len rdx
+%define idx rdx
+
+%define size_offset rcx
+%define tmp2 rcx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*6 + 16*10 + 8
+
+; SHA1_JOB* sha1_mb_mgr_submit_sse_ni(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha1_mb_mgr_submit_sse_ni, function
+sha1_mb_mgr_submit_sse_ni:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+	movdqa  [rsp + 8*6 + 16*0], xmm6	; 8*6 keeps the XMM area clear of the r12/r13 slots at 8*4/8*5
+	movdqa  [rsp + 8*6 + 16*1], xmm7
+	movdqa  [rsp + 8*6 + 16*2], xmm8
+	movdqa  [rsp + 8*6 + 16*3], xmm9
+	movdqa  [rsp + 8*6 + 16*4], xmm10
+	movdqa  [rsp + 8*6 + 16*5], xmm11
+	movdqa  [rsp + 8*6 + 16*6], xmm12
+	movdqa  [rsp + 8*6 + 16*7], xmm13
+	movdqa  [rsp + 8*6 + 16*8], xmm14
+	movdqa  [rsp + 8*6 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+ movd [state + _args_digest + 4*lane + 0*16], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*16], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+
+ cmp unused_lanes, 0xF32 ; we will process two jobs at the same time
+ jne return_null ; wait for another sha_ni job
+
+	; compare with the SHA-NI single-buffer threshold; if the SSE lane count <= threshold, use the SHA-NI function
+ %if SHA1_NI_SB_THRESHOLD_SSE >= 4 ; there are 4 lanes in sse mb
+ ; shani glue code
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+	; after subtracting len2, the selected lane's length is just idx (zero whole blocks left)
+ sub lens0, len2
+ sub lens1, len2
+
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov r10, idx
+	or	r10, 0x1000	; r10 byte 0 = lane idx, byte 1 = 16 (4 SSE lanes * 4 bytes, the digest lane stride)
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha1_ni_x2
+ ; state and idx are intact
+
+ %else
+ ; original mb code
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_sse
+ ; state and idx are intact
+ %endif
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*16]
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+	movdqa  xmm6, [rsp + 8*6 + 16*0]
+	movdqa  xmm7, [rsp + 8*6 + 16*1]
+	movdqa  xmm8, [rsp + 8*6 + 16*2]
+	movdqa  xmm9, [rsp + 8*6 + 16*3]
+	movdqa  xmm10, [rsp + 8*6 + 16*4]
+	movdqa  xmm11, [rsp + 8*6 + 16*5]
+	movdqa  xmm12, [rsp + 8*6 + 16*6]
+	movdqa  xmm13, [rsp + 8*6 + 16*7]
+	movdqa  xmm14, [rsp + 8*6 + 16*8]
+	movdqa  xmm15, [rsp + 8*6 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x67452301
+H1: dd 0xefcdab89
+H2: dd 0x98badcfe
+H3: dd 0x10325476
+H4: dd 0xc3d2e1f0
+
+%else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha1_mb_mgr_submit_sse_ni
+ no_sha1_mb_mgr_submit_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
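
When the SHA-NI path is compiled in, this submit variant does not wait for all four SSE lanes: it queues jobs until the free-lane stack reads 0xF32 (two pops from the initial 0xF3210, i.e. lanes 0 and 1 are occupied) and then drives the two-buffer sha1_ni_x2 kernel over the block count the two jobs have in common. A small sketch of that gate, with a hypothetical helper name:

    /* Illustrative check mirroring the `cmp unused_lanes, 0xF32` gate above. */
    static int sha1_ni_batch_ready(uint64_t unused_lanes, const uint32_t lens[2],
                                   uint32_t *idx, uint32_t *blocks)
    {
        if (unused_lanes != 0xF32)              /* fewer than two jobs queued: keep waiting */
            return 0;
        uint32_t min = lens[0] < lens[1] ? lens[0] : lens[1];
        *idx    = min & 0xF;                    /* lane that finishes first      */
        *blocks = min >> 4;                     /* whole blocks both lanes share */
        return *blocks != 0;
    }
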
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c
new file mode 100644
index 000000000..3925a6f4b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c
@@ -0,0 +1,159 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha1_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ int ret;
+
+ printf("multibinary_sha1 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ SHA1(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ SHA1(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1_ssl rand: Pass\n");
+
+ return fail;
+}
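
The test above also documents the intended calling pattern for the context-manager API: allocate a 16-byte-aligned SHA1_HASH_CTX_MGR, initialize it once, hash_ctx_init() each context, submit buffers with HASH_ENTIRE, and keep calling sha1_ctx_mgr_flush() until it returns NULL to drain the remaining lanes. Distilled into a minimal single-buffer sketch (error handling trimmed):

    #include <stdlib.h>
    #include "sha1_mb.h"

    /* Minimal use of the multi-buffer context-manager API (sketch). */
    int sha1_one_buffer(void *buf, uint32_t len, uint32_t digest[SHA1_DIGEST_NWORDS])
    {
        SHA1_HASH_CTX_MGR *mgr = NULL;
        SHA1_HASH_CTX ctx;

        if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
            return 1;
        sha1_ctx_mgr_init(mgr);
        hash_ctx_init(&ctx);

        sha1_ctx_mgr_submit(mgr, &ctx, buf, len, HASH_ENTIRE);
        while (sha1_ctx_mgr_flush(mgr))         /* drain until every lane completes */
            ;

        for (int i = 0; i < SHA1_DIGEST_NWORDS; i++)
            digest[i] = ctx.job.result_digest[i];
        free(mgr);
        return 0;
    }

With a single job the multi-buffer manager cannot fill its lanes, so the digest only appears once flush() forces the partially filled batch through; batching many submissions before flushing is what the interface is designed for.
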
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c
new file mode 100644
index 000000000..4eeeaba0a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c
@@ -0,0 +1,202 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha1_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+ int ret;
+
+ printf("multibinary_sha1 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha1_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+ // Run sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
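+	// Note: submit() queues work across the manager's lanes and can return before a
+	// given job is complete; looping on flush() until it returns NULL drains the
+	// remaining lanes so every result_digest is final before it is checked.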
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ sha1_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sha1_mb test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%08X <=> 0x%08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ sha1_ctx_mgr_init(mgr);
+
+	// Construct jobs as suffixes that run to the end of the allocated buffer
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ sha1_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%08X"
+ ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1 rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c
new file mode 100644
index 000000000..aaa52a0ff
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c
@@ -0,0 +1,297 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha1_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*SHA1_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA1_BLOCK_SIZE))
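+// UPDATE_SIZE is the fixed chunk used for each HASH_FIRST/HASH_UPDATE submission in
+// the sequential pass below; MAX_RAND_UPDATE_BLOCKS bounds the randomly chosen
+// update lengths used in the randomized pass.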
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS];
+
+extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+ int ret;
+
+ printf("multibinary_sha1_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha1_ref(bufs[i], digest_ref[i], TEST_LEN);
+ }
+
+ // Run sb_sha1 tests
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+		// No ctx returned (lanes still filling) or job finished: move to next buffer
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha1_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha1_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha1_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+				printf("Test%d fixed size, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ sha1_ref(bufs[i], digest_ref[i], lens[i]);
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ // Run sha1_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = SHA1_BLOCK_SIZE +
+ SHA1_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % SHA1_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem, HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha1_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha1_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % SHA1_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha1_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1_update rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c
new file mode 100644
index 000000000..6261bbf44
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c
@@ -0,0 +1,233 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "endian_helper.h"
+
+typedef uint32_t DigestSHA1[SHA1_DIGEST_NWORDS];
+
+#define MSGS 7
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
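+// Deterministic scramble of the job index: maps each of the NUM_JOBS submissions to
+// one of the MSGS known-answer messages, so jobs of different lengths end up
+// interleaved across the lanes.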
+static uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
+static DigestSHA1 expResultDigest1 =
+ { 0x84983E44, 0x1C3BD26E, 0xBAAE4AA1, 0xF95129E5, 0xE54670F1 };
+
+static uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO";
+static DigestSHA1 expResultDigest2 =
+ { 0xB7C66452, 0x0FD122B3, 0x55D539F2, 0xA35E6FAA, 0xC2A5A11D };
+
+static uint8_t msg3[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<";
+static DigestSHA1 expResultDigest3 =
+ { 0x127729B6, 0xA8B2F8A0, 0xA4DDC819, 0x08E1D8B3, 0x67CEEA55 };
+
+static uint8_t msg4[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR";
+static DigestSHA1 expResultDigest4 =
+ { 0xFDDE2D00, 0xABD5B7A3, 0x699DE6F2, 0x3FF1D1AC, 0x3B872AC2 };
+
+static uint8_t msg5[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?";
+static DigestSHA1 expResultDigest5 =
+ { 0xE7FCA85C, 0xA4AB3740, 0x6A180B32, 0x0B8D362C, 0x622A96E6 };
+
+static uint8_t msg6[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU";
+static DigestSHA1 expResultDigest6 =
+ { 0x505B0686, 0xE1ACDF42, 0xB3588B5A, 0xB043D52C, 0x6D8C7444 };
+
+static uint8_t msg7[] = "";
+static DigestSHA1 expResultDigest7 =
+ { 0xDA39A3EE, 0x5E6B4B0D, 0x3255BFEF, 0x95601890, 0xAFD80709 };
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 };
+
+static uint32_t *expResultDigest[MSGS] = {
+ expResultDigest1, expResultDigest2, expResultDigest3,
+ expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7
+};
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i], msgs[i],
+ strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
+ while (1) {
+ ctx = sha1_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+				       " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = sha1_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+				       " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sha1 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..bd8e5e527
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c
@@ -0,0 +1,128 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha1_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
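+// In the cold case TEST_LEN is chosen so the combined working set
+// (TEST_BUFS * TEST_LEN) exceeds a typical last-level cache and each timed pass
+// streams from memory; the warm case loops over small, cached buffers instead.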
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS];
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+			printf("calloc failed, test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sha1_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA1(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha1_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha1" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
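+	// OpenSSL's SHA1() emits the digest as a big-endian byte stream, while
+	// result_digest holds native uint32 words, so each OpenSSL word is converted
+	// with to_be32() before the comparison.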
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer sha1 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c
new file mode 100644
index 000000000..0b4438d53
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_shortage_perf.c
@@ -0,0 +1,132 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha1_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS SHA1_MAX_LANES
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS];
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ uint32_t nlanes;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+			printf("calloc failed, test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sha1_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA1(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha1_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb shortage tests
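+	// Sweep the number of submitted jobs from SHA1_MAX_LANES down to 1; with fewer
+	// jobs than lanes, flush() has to process partially filled batches, measuring
+	// how throughput behaves when the manager is short of work.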
+ for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) {
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < nlanes; i++)
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN,
+ HASH_ENTIRE);
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha1" TEST_TYPE_STR " with %d lanes: ", nlanes);
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < nlanes; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ }
+
+ printf("Multi-buffer sha1 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+		printf(" multibinary_sha1_ossl_shortage_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm
new file mode 100644
index 000000000..d64ffe2bd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm
@@ -0,0 +1,563 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute 16 SHA1 digests in parallel using AVX-512
+;; outer calling routine takes care of save and restore of XMM registers
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers zmm0-31
+
+%define APPEND(a,b) a %+ b
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0 preserved
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2 preserved
+ %define reg4 r9 ; arg3
+ %define var1 rdi
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var1 rdx ; arg2
+ %define var2 rcx ; arg3
+ %define local_func_decl(func_name) mk_global func_name, function, internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+
+%define IDX var1
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define KT zmm5
+%define AA zmm6
+%define BB zmm7
+%define CC zmm8
+%define DD zmm9
+%define EE zmm10
+%define TMP0 zmm11
+%define TMP1 zmm12
+%define TMP2 zmm13
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+
+; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
+; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
+; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
+; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
+; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
+; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
+; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
+; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
+; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}
+
+; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
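+;
+; i.e. after the transpose, output register N holds dword N of the 64-byte block
+; from all 16 lanes; together with the caller's byte-flip shuffle this is the
+; layout the vectorized rounds below consume.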
+
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; use r6 in place of t0
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
+
+	vshufps	%%r11, %%r6, %%t1, 0xDD	; r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
+	vshufps	%%r9, %%r8, %%r10, 0x88	; r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
+	vshufps	%%r8, %%r8, %%r10, 0xDD	; r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
+	vshufps	%%r6, %%r6, %%t1, 0x88	; r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
+
+ ; use r10 in place of t0
+	vshufps	%%r10, %%r12, %%r13, 0x44	; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0}
+	vshufps	%%r12, %%r12, %%r13, 0xEE	; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2}
+	vshufps	%%t1, %%r14, %%r15, 0x44	; t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0}
+	vshufps	%%r14, %%r14, %%r15, 0xEE	; r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2}
+
+	vshufps	%%r15, %%r10, %%t1, 0xDD	; r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1}
+	vshufps	%%r13, %%r12, %%r14, 0x88	; r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2}
+	vshufps	%%r12, %%r12, %%r14, 0xDD	; r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3}
+	vshufps	%%r10, %%r10, %%t1, 0x88	; r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0}
+
+;; At this point, the registers that contain interesting data are:
+;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
+;; Can use t1 and r14 as scratch registers
+
+ vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
+ vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
+
+ vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
+ vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
+
+ vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
+ vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
+
+ vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
+ vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
+
+;; At this point r8 and r12 can be used as scratch registers
+
+ vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+
+ vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+ vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+ vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+ vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+ vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+ vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+
+ vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+%endmacro
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%F_IMMED %2
+
+ ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
+ ; E=D, D=C, C=ROTL_30(B), B=A, A=T
+
+ ; Ft
+ ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
+ ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
+ ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
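+	;
+	; Each Ft is computed with one vpternlogd: with TMP1=B, C, D as the three
+	; sources, imm8 bit (B<<2 | C<<1 | D) gives the result, so
+	;   Ch     -> imm8 0xCA   (B ? C : D)
+	;   Parity -> imm8 0x96   (B ^ C ^ D)
+	;   Maj    -> imm8 0xE8   (at least two of B,C,D set)
+	; which is why the round loop below switches %%F_IMMED between 0xCA, 0x96 and 0xE8.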
+
+ vmovdqa32 TMP1, B ; Copy B
+ vpaddd E, E, %%WT ; E = E + Wt
+ vpternlogd TMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D)
+ vpaddd E, E, KT ; E = E + Wt + Kt
+ vprold TMP0, A, 5 ; TMP0 = ROTL_5(A)
+ vpaddd E, E, TMP1 ; E = Ft(B,C,D) + E + Kt + Wt
+ vprold B, B, 30 ; B = ROTL_30(B)
+ vpaddd E, E, TMP0 ; E = T
+
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_79 4
+%define %%WT %1
+%define %%WTp2 %2
+%define %%WTp8 %3
+%define %%WTp13 %4
+ ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16)
+ ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt)
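+	; (vpternlogd with imm8 0x96 is a three-way XOR, so the ternlog plus the vpxord
+	; below form Wt ^ Wt+2 ^ Wt+8 ^ Wt+13 and the vprold completes the ROTL_1, for
+	; all 16 lanes at once)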
+ vpternlogd %%WT, %%WTp2, %%WTp8, 0x96
+ vpxord %%WT, %%WT, %%WTp13
+ vprold %%WT, %%WT, 1
+%endmacro
+
+; Note this is reading in a block of data for one lane
+; When all 16 are read, the data must be transposed to build msg schedule
+%macro MSG_SCHED_ROUND_00_15 2
+%define %%WT %1
+%define %%OFFSET %2
+ mov inp0, [IN + (%%OFFSET*8)]
+ vmovups %%WT, [inp0+IDX]
+%endmacro
+
+align 64
+
+; void sha1_mb_x16_avx512(SHA1_MB_ARGS_X16, uint32_t size)
+; arg 1 : pointer to input data
+; arg 2 : size (in blocks) ;; assumed to be >= 1
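+;
+; The digest area addressed through state is laid out word-major: 64 bytes of A
+; values (one dword per lane), then 64 bytes of B, and so on, which is what the
+; vmovups loads and stores below assume.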
+local_func_decl(sha1_mb_x16_avx512)
+sha1_mb_x16_avx512:
+ endbranch
+
+ ;; Initialize digests
+ vmovups A, [DIGEST + 0*64]
+ vmovups B, [DIGEST + 1*64]
+ vmovups C, [DIGEST + 2*64]
+ vmovups D, [DIGEST + 3*64]
+ vmovups E, [DIGEST + 4*64]
+
+ xor IDX, IDX
+
+ ;; transpose input onto stack
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+
+ vmovups W0,[inp0+IDX]
+ vmovups W1,[inp1+IDX]
+ vmovups W2,[inp2+IDX]
+ vmovups W3,[inp3+IDX]
+ vmovups W4,[inp4+IDX]
+ vmovups W5,[inp5+IDX]
+ vmovups W6,[inp6+IDX]
+ vmovups W7,[inp7+IDX]
+
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+
+ vmovups W8, [inp0+IDX]
+ vmovups W9, [inp1+IDX]
+ vmovups W10,[inp2+IDX]
+ vmovups W11,[inp3+IDX]
+ vmovups W12,[inp4+IDX]
+ vmovups W13,[inp5+IDX]
+ vmovups W14,[inp6+IDX]
+ vmovups W15,[inp7+IDX]
+
+lloop:
+ vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ add IDX, 64
+
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+
+%assign I 0
+%rep 16
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+
+ ; Save digests for later addition
+ vmovdqa32 AA, A
+ vmovdqa32 BB, B
+ vmovdqa32 CC, C
+ vmovdqa32 DD, D
+ vmovdqa32 EE, E
+
+ vmovdqa32 KT, [K00_19]
+%assign I 0xCA
+%assign J 0
+%assign K 2
+%assign L 8
+%assign M 13
+%assign N 0
+%rep 64
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ %if N = 19
+ vmovdqa32 KT, [K20_39]
+ %assign I 0x96
+ %elif N = 39
+ vmovdqa32 KT, [K40_59]
+ %assign I 0xE8
+ %elif N = 59
+ vmovdqa32 KT, [K60_79]
+ %assign I 0x96
+ %endif
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%assign N (N+1)
+%endrep
+
+ ; Check if this is the last block
+ sub SIZE, 1
+ je lastLoop
+
+%assign I 0x96
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_00_15 APPEND(W,J), J
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ jmp lloop
+
+lastLoop:
+; Need to reset argument rotation values to Round 64 values
+%xdefine TMP_ A
+%xdefine A B
+%xdefine B C
+%xdefine C D
+%xdefine D E
+%xdefine E TMP_
+
+ ; Process last 16 rounds
+%assign I 0x96
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ ;; update into data pointers
+%assign I 0
+%rep 8
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ ; Write out digest
+ ; Do we need to untranspose digests???
+ vmovups [DIGEST + 0*64], A
+ vmovups [DIGEST + 1*64], B
+ vmovups [DIGEST + 2*64], C
+ vmovups [DIGEST + 3*64], D
+ vmovups [DIGEST + 4*64], E
+
+ ret
+
+section .data
+align 64
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_mb_x16_avx512
+no_sha1_mb_x16_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm
new file mode 100644
index 000000000..eb67309da
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm
@@ -0,0 +1,416 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA1 using AVX
+;; derived from ...\sha1_multiple\sha1_quad4.asm
+;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = ((B & C) | ((~ B) & D) )
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpand %%regF, %%regB,%%regC
+ vpandn %%regT, %%regB,%%regD
+ vpor %%regF, %%regT,%%regF
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-(%%imm))
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-(%%imm))
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[rsp + (%%memW * 16)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
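+;; (the call leaves rsp at 8 mod 16, so reserving an odd multiple of 8 restores
+;; 16-byte alignment for the aligned vmovdqa stores to [rsp + ...])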
+%define XMM_SAVE ((15-15)*16 + 1*8)
+%define FRAMESZ 16*16 + XMM_SAVE
+%define _XMM FRAMESZ - XMM_SAVE
+
+%define VMOVPS vmovups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%define IDX rax
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+%define DIGEST_SIZE (4*5*4)
+
+;%ifdef LINUX
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define ARG1 rdi
+ %define ARG2 rsi
+%else
+ ; Windows
+ %define ARG1 rcx
+ %define ARG2 rdx
+%endif
+
+align 32
+
+; void sha1_mb_x4_avx(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks);
+; arg 1 : ARG1 : pointer to args (only 4 of the 8 lanes used)
+; arg 2 : ARG2 : size (in blocks) ;; assumed to be >= 1
+;
+; Clobbers registers: ARG2, rax, r8-r11, xmm0-xmm15
+;
+mk_global sha1_mb_x4_avx, function, internal
+sha1_mb_x4_avx:
+ endbranch
+
+ sub rsp, FRAMESZ ;; FRAMESZ + pushes must be odd multiple of 8
+
+ ;; Initialize digests
+ vmovdqa A, [ARG1 + 0*16]
+ vmovdqa B, [ARG1 + 1*16]
+ vmovdqa C, [ARG1 + 2*16]
+ vmovdqa D, [ARG1 + 3*16]
+ vmovdqa E, [ARG1 + 4*16]
+
+ ;; load input pointers
+ mov inp0,[ARG1 + _data_ptr + 0*8]
+ mov inp1,[ARG1 + _data_ptr + 1*8]
+ mov inp2,[ARG1 + _data_ptr + 2*8]
+ mov inp3,[ARG1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+%assign I 0
+%rep 4
+ VMOVPS T2,[inp0+IDX]
+ VMOVPS T1,[inp1+IDX]
+ VMOVPS T4,[inp2+IDX]
+ VMOVPS T3,[inp3+IDX]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vpshufb T0, T0, F
+ vmovdqa [rsp+(I*4+0)*16],T0
+ vpshufb T1, T1, F
+ vmovdqa [rsp+(I*4+1)*16],T1
+ vpshufb T2, T2, F
+ vmovdqa [rsp+(I*4+2)*16],T2
+ vpshufb T3, T3, F
+ vmovdqa [rsp+(I*4+3)*16],T3
+ add IDX, 4*4
+%assign I (I+1)
+%endrep
+
+ ; save old digests
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [rsp + ((16 - 16) & 15) * 16]
+ vmovdqa W15, [rsp + ((16 - 15) & 15) * 16]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ vmovdqa K, [K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ vmovdqa K, [K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ vmovdqa K, [K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ sub ARG2, 1
+ jne lloop
+
+ ; write out digests
+ vmovdqa [ARG1 + 0*16], A
+ vmovdqa [ARG1 + 1*16], B
+ vmovdqa [ARG1 + 2*16], C
+ vmovdqa [ARG1 + 3*16], D
+ vmovdqa [ARG1 + 4*16], E
+
+ ; update input pointers
+ add inp0, IDX
+ mov [ARG1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [ARG1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [ARG1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [ARG1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+
+ ret
+
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm
new file mode 100644
index 000000000..5677dce73
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm
@@ -0,0 +1,413 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA1 using SSE
+;; derived from ...\sha1_multiple\sha1_quad4.asm
+;; variation of sha1_mult2.asm
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0}
+ shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0}
+ shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0}
+ shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2}
+ shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
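In other words, TRANSPOSE is an ordinary 4x4 transpose of 32-bit words, so that afterwards
each register holds the same message word from all four lanes. A scalar sketch of the
effect (illustration only; it ignores the t0/r1/r0/r3 output-register naming):

    #include <stdint.h>

    /* in[lane][w] is word w of that lane's 16-byte chunk;
     * out[w][lane] collects word w from every lane. */
    static void transpose_4x4(const uint32_t in[4][4], uint32_t out[4][4])
    {
            for (int w = 0; w < 4; w++)
                    for (int lane = 0; lane < 4; lane++)
                            out[w][lane] = in[lane][w];
    }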
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regC
+ pxor %%regF,%%regD
+ pand %%regF,%%regB
+ pxor %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regD
+ pxor %%regF,%%regC
+ pxor %%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regB
+ movdqa %%regT,%%regB
+ por %%regF,%%regC
+ pand %%regT,%%regC
+ pand %%regF,%%regD
+ por %%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
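The boolean identities these macros rely on are easier to verify in scalar form. A C
sketch of the three FIPS 180-1 functions exactly as the macros compute them (MAGIC_F0 is
the branch-free rewrite of Ch, MAGIC_F2 an equivalent form of Maj; sketch only, not part
of the patch):

    #include <stdint.h>

    static inline uint32_t f0_ch(uint32_t b, uint32_t c, uint32_t d)
    {
            return d ^ (b & (c ^ d));            /* == (b & c) | (~b & d) */
    }

    static inline uint32_t f1_parity(uint32_t b, uint32_t c, uint32_t d)
    {
            return b ^ c ^ d;                    /* MAGIC_F3 reuses this for rounds 60..79 */
    }

    static inline uint32_t f2_maj(uint32_t b, uint32_t c, uint32_t d)
    {
            return ((b | c) & d) | (b & c);      /* == (b & c) | (b & d) | (c & d) */
    }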
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ pslld %%reg, %%imm
+ psrld %%tmp, (32-%%imm)
+ por %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ paddd %%regE,%%immCNT
+ paddd %%regE,[rsp + (%%memW * 16)]
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ paddd %%regE,%%immCNT
+ movdqa W14, [rsp + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [rsp + ((%%memW - 8) & 15) * 16]
+ pxor W16, [rsp + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+ ROTATE_W
+
+ movdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define XMM_SAVE ((15-15)*16 + 1*8)
+%define FRAMESZ 16*16 + XMM_SAVE
+%define _XMM FRAMESZ - XMM_SAVE
+
+%define MOVPS movups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%define IDX rax
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+%define DIGEST_SIZE (4*5*4)
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define ARG1 rdi
+ %define ARG2 rsi
+%else
+ ; Windows
+ %define ARG1 rcx
+ %define ARG2 rdx
+%endif
+
+align 32
+
+; void sha1_mb_x4_sse(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks);
+; arg 1 : ARG1 : pointer to args (only 4 of the 8 lanes used)
+; arg 2 : ARG2 : size (in blocks) ;; assumed to be >= 1
+;
+; Clobbers registers: ARG2, rax, r8-r11, xmm0-xmm15
+;
+mk_global sha1_mb_x4_sse, function, internal
+sha1_mb_x4_sse:
+ endbranch
+
+ sub rsp, FRAMESZ ;; FRAMESZ + pushes must be odd multiple of 8
+
+ ;; Initialize digests
+ movdqa A, [ARG1 + 0*16]
+ movdqa B, [ARG1 + 1*16]
+ movdqa C, [ARG1 + 2*16]
+ movdqa D, [ARG1 + 3*16]
+ movdqa E, [ARG1 + 4*16]
+
+ ;; load input pointers
+ mov inp0,[ARG1 + _data_ptr + 0*8]
+ mov inp1,[ARG1 + _data_ptr + 1*8]
+ mov inp2,[ARG1 + _data_ptr + 2*8]
+ mov inp3,[ARG1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ movdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+%assign I 0
+%rep 4
+ MOVPS T2,[inp0+IDX]
+ MOVPS T1,[inp1+IDX]
+ MOVPS T4,[inp2+IDX]
+ MOVPS T3,[inp3+IDX]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ pshufb T0, F
+ movdqa [rsp+(I*4+0)*16],T0
+ pshufb T1, F
+ movdqa [rsp+(I*4+1)*16],T1
+ pshufb T2, F
+ movdqa [rsp+(I*4+2)*16],T2
+ pshufb T3, F
+ movdqa [rsp+(I*4+3)*16],T3
+ add IDX, 4*4
+%assign I (I+1)
+%endrep
+
+ ; save old digests
+ movdqa AA, A
+ movdqa BB, B
+ movdqa CC, C
+ movdqa DD, D
+ movdqa EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ movdqa K, [K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ movdqa W16, [rsp + ((16 - 16) & 15) * 16]
+ movdqa W15, [rsp + ((16 - 15) & 15) * 16]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ movdqa K, [K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ movdqa K, [K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ movdqa K, [K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+ paddd A,AA
+ paddd B,BB
+ paddd C,CC
+ paddd D,DD
+ paddd E,EE
+
+ sub ARG2, 1
+ jne lloop
+
+ ; write out digests
+ movdqa [ARG1 + 0*16], A
+ movdqa [ARG1 + 1*16], B
+ movdqa [ARG1 + 2*16], C
+ movdqa [ARG1 + 3*16], D
+ movdqa [ARG1 + 4*16], E
+
+ ; update input pointers
+ add inp0, IDX
+ mov [ARG1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [ARG1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [ARG1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [ARG1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+
+ ret
+
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm
new file mode 100644
index 000000000..edcba6d3f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm
@@ -0,0 +1,518 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute oct SHA1 using AVX2
+;; outer calling routine takes care of save and restore of XMM registers
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers ymm0-15
+
+
+; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+; "transpose" data in {r0...r7} using temps {t0...t1}
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+;
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ ; process bottom half (r4..r7) {e...h}
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
+ vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
+ vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
+ vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
+ vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
+ vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
+ vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
+ vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
+%endmacro
+
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; F0 = ((B & C) | ((~B) & D))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpand %%regF, %%regB,%%regC
+ vpandn %%regT, %%regB,%%regD
+ vpor %%regF, %%regT,%%regF
+%endmacro
+
+;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+
+
+;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-%%imm)
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[rsp + (%%memW * 32)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqu W14, [rsp + ((%%memW - 14) & 15) * 32]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 32]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqu [rsp + ((%%memW - 0) & 15) * 32],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define YMM_SAVE (15-15)*32
+%define FRAMESZ 32*16 + 0*8 + YMM_SAVE
+%define _YMM FRAMESZ - YMM_SAVE
+
+%define VMOVPS vmovups
+
+%define IDX rax
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define inp7 rcx
+ %define arg1 rdi
+ %define arg2 rsi
+ %define RSP_SAVE rdx
+%else
+ %define inp7 rdi
+ %define arg1 rcx
+ %define arg2 rdx
+ %define RSP_SAVE rsi
+%endif
+
+
+; ymm0 A
+; ymm1 B
+; ymm2 C
+; ymm3 D
+; ymm4 E
+; ymm5 F AA
+; ymm6 T0 BB
+; ymm7 T1 CC
+; ymm8 T2 DD
+; ymm9 T3 EE
+; ymm10 T4 TMP
+; ymm11 T5 FUN
+; ymm12 T6 K
+; ymm13 T7 W14
+; ymm14 T8 W15
+; ymm15 T9 W16
+
+%define A ymm0
+%define B ymm1
+%define C ymm2
+%define D ymm3
+%define E ymm4
+
+%define F ymm5
+%define T0 ymm6
+%define T1 ymm7
+%define T2 ymm8
+%define T3 ymm9
+%define T4 ymm10
+%define T5 ymm11
+%define T6 ymm12
+%define T7 ymm13
+%define T8 ymm14
+%define T9 ymm15
+
+%define AA ymm5
+%define BB ymm6
+%define CC ymm7
+%define DD ymm8
+%define EE ymm9
+%define TMP ymm10
+%define FUN ymm11
+%define K ymm12
+%define W14 ymm13
+%define W15 ymm14
+%define W16 ymm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+%define DIGEST_SIZE (8*5*4) ; 8 streams x 5 32bit words per digest x 4 bytes per word
+
+align 32
+
+; void sha1_mb_x8_avx2(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks);
+; arg 1 : pointer to args
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+mk_global sha1_mb_x8_avx2, function, internal
+sha1_mb_x8_avx2:
+ endbranch
+
+ push RSP_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+ sub rsp, FRAMESZ ;; FRAMESZ + pushes must be even multiple of 8
+
+ ; align rsp to 32 Bytes
+ and rsp, ~0x1F
+
+ ;; Initialize digests
+ vmovdqu A, [arg1 + 0*32]
+ vmovdqu B, [arg1 + 1*32]
+ vmovdqu C, [arg1 + 2*32]
+ vmovdqu D, [arg1 + 3*32]
+ vmovdqu E, [arg1 + 4*32]
+
+ ;; transpose input onto stack
+ mov inp0,[arg1+_data_ptr+0*8]
+ mov inp1,[arg1+_data_ptr+1*8]
+ mov inp2,[arg1+_data_ptr+2*8]
+ mov inp3,[arg1+_data_ptr+3*8]
+ mov inp4,[arg1+_data_ptr+4*8]
+ mov inp5,[arg1+_data_ptr+5*8]
+ mov inp6,[arg1+_data_ptr+6*8]
+ mov inp7,[arg1+_data_ptr+7*8]
+
+ xor IDX, IDX
+lloop:
+ vmovdqu F, [PSHUFFLE_BYTE_FLIP_MASK]
+%assign I 0
+%rep 2
+ VMOVPS T0,[inp0+IDX]
+ VMOVPS T1,[inp1+IDX]
+ VMOVPS T2,[inp2+IDX]
+ VMOVPS T3,[inp3+IDX]
+ VMOVPS T4,[inp4+IDX]
+ VMOVPS T5,[inp5+IDX]
+ VMOVPS T6,[inp6+IDX]
+ VMOVPS T7,[inp7+IDX]
+ TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
+ vpshufb T0, T0, F
+ vmovdqu [rsp+(I*8+0)*32],T0
+ vpshufb T1, T1, F
+ vmovdqu [rsp+(I*8+1)*32],T1
+ vpshufb T2, T2, F
+ vmovdqu [rsp+(I*8+2)*32],T2
+ vpshufb T3, T3, F
+ vmovdqu [rsp+(I*8+3)*32],T3
+ vpshufb T4, T4, F
+ vmovdqu [rsp+(I*8+4)*32],T4
+ vpshufb T5, T5, F
+ vmovdqu [rsp+(I*8+5)*32],T5
+ vpshufb T6, T6, F
+ vmovdqu [rsp+(I*8+6)*32],T6
+ vpshufb T7, T7, F
+ vmovdqu [rsp+(I*8+7)*32],T7
+ add IDX, 32
+%assign I (I+1)
+%endrep
+
+
+ ; save old digests
+ vmovdqu AA, A
+ vmovdqu BB, B
+ vmovdqu CC, C
+ vmovdqu DD, D
+ vmovdqu EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ vmovdqu K, [K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqu W16, [rsp + ((16 - 16) & 15) * 32]
+ vmovdqu W15, [rsp + ((16 - 15) & 15) * 32]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ vmovdqu K, [K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ vmovdqu K, [K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ vmovdqu K, [K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ sub arg2, 1
+ jne lloop
+
+ ; write out digests
+ vmovdqu [arg1 + 0*32], A
+ vmovdqu [arg1 + 1*32], B
+ vmovdqu [arg1 + 2*32], C
+ vmovdqu [arg1 + 3*32], D
+ vmovdqu [arg1 + 4*32], E
+
+ ;; update input pointers
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [arg1+_data_ptr+0*8], inp0
+ mov [arg1+_data_ptr+1*8], inp1
+ mov [arg1+_data_ptr+2*8], inp2
+ mov [arg1+_data_ptr+3*8], inp3
+ mov [arg1+_data_ptr+4*8], inp4
+ mov [arg1+_data_ptr+5*8], inp5
+ mov [arg1+_data_ptr+6*8], inp6
+ mov [arg1+_data_ptr+7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ mov rsp, RSP_SAVE
+
+ pop RSP_SAVE
+ ret
+
+
+
+section .data align=32
+
+align 32
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c
new file mode 100644
index 000000000..e778c5d98
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c
@@ -0,0 +1,112 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "test.h"
+
+// Test messages
+#define TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
+uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO";
+uint8_t msg3[] = TST_STR TST_STR "0123456789:;<";
+uint8_t msg4[] = TST_STR TST_STR TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR";
+uint8_t msg5[] = TST_STR TST_STR TST_STR TST_STR TST_STR "0123456789:;<=>?";
+uint8_t msg6[] =
+ TST_STR TST_STR TST_STR TST_STR TST_STR TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU";
+uint8_t msg7[] = "";
+
+// Expected digests
+uint32_t dgst1[] = { 0x84983E44, 0x1C3BD26E, 0xBAAE4AA1, 0xF95129E5, 0xE54670F1 };
+uint32_t dgst2[] = { 0xB7C66452, 0x0FD122B3, 0x55D539F2, 0xA35E6FAA, 0xC2A5A11D };
+uint32_t dgst3[] = { 0x127729B6, 0xA8B2F8A0, 0xA4DDC819, 0x08E1D8B3, 0x67CEEA55 };
+uint32_t dgst4[] = { 0xFDDE2D00, 0xABD5B7A3, 0x699DE6F2, 0x3FF1D1AC, 0x3B872AC2 };
+uint32_t dgst5[] = { 0xE7FCA85C, 0xA4AB3740, 0x6A180B32, 0x0B8D362C, 0x622A96E6 };
+uint32_t dgst6[] = { 0x505B0686, 0xE1ACDF42, 0xB3588B5A, 0xB043D52C, 0x6D8C7444 };
+uint32_t dgst7[] = { 0xDA39A3EE, 0x5E6B4B0D, 0x3255BFEF, 0x95601890, 0xAFD80709 };
+
+uint8_t *msgs[] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 };
+uint32_t *expected_digest[] = { dgst1, dgst2, dgst3, dgst4, dgst5, dgst6, dgst7 };
+
+int check_job(uint32_t * ref, uint32_t * good, int words)
+{
+ int i;
+ for (i = 0; i < words; i++)
+ if (good[i] != ref[i])
+ return 1;
+
+ return 0;
+}
+
+#define MAX_MSGS 7
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[MAX_MSGS];
+ SHA1_HASH_CTX *p_job;
+ int i, checked = 0, failed = 0;
+ int n = sizeof(msgs) / sizeof(msgs[0]);
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+ // Initialize multi-buffer manager
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < n; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)expected_digest[i];
+
+ p_job = sha1_ctx_mgr_submit(mgr, &ctxpool[i], msgs[i],
+ strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (p_job) { // If we have finished a job, process it
+ checked++;
+ failed +=
+ check_job(p_job->job.result_digest, p_job->user_data,
+ SHA1_DIGEST_NWORDS);
+ }
+ }
+
+ // Finish remaining jobs
+ while (NULL != (p_job = sha1_ctx_mgr_flush(mgr))) {
+ checked++;
+ failed +=
+ check_job(p_job->job.result_digest, p_job->user_data, SHA1_DIGEST_NWORDS);
+ }
+
+ printf("Example multi-buffer sha1 completed=%d, failed=%d\n", checked, failed);
+ return failed;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm
new file mode 100644
index 000000000..c205f2389
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm
@@ -0,0 +1,131 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern sha1_ctx_mgr_init_sse
+extern sha1_ctx_mgr_submit_sse
+extern sha1_ctx_mgr_flush_sse
+
+extern sha1_ctx_mgr_init_avx
+extern sha1_ctx_mgr_submit_avx
+extern sha1_ctx_mgr_flush_avx
+
+extern sha1_ctx_mgr_init_avx2
+extern sha1_ctx_mgr_submit_avx2
+extern sha1_ctx_mgr_flush_avx2
+
+extern sha1_ctx_mgr_init_base
+extern sha1_ctx_mgr_submit_base
+extern sha1_ctx_mgr_flush_base
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern sha1_ctx_mgr_init_avx512
+ extern sha1_ctx_mgr_submit_avx512
+ extern sha1_ctx_mgr_flush_avx512
+%endif
+
+%ifdef HAVE_AS_KNOWS_SHANI
+ extern sha1_ctx_mgr_init_sse_ni
+ extern sha1_ctx_mgr_submit_sse_ni
+ extern sha1_ctx_mgr_flush_sse_ni
+%endif
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ %ifdef HAVE_AS_KNOWS_SHANI
+ extern sha1_ctx_mgr_init_avx512_ni
+ extern sha1_ctx_mgr_submit_avx512_ni
+ extern sha1_ctx_mgr_flush_avx512_ni
+ %endif
+%endif
+
+;;; *_mbinit are the initial values for *_dispatched, which is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+; Initialise symbols
+mbin_interface sha1_ctx_mgr_init
+mbin_interface sha1_ctx_mgr_submit
+mbin_interface sha1_ctx_mgr_flush
+
+%ifdef HAVE_AS_KNOWS_AVX512
+	; Reuse the mbin_dispatch_init6 extension, replacing the base version with the SSE one
+ %ifdef HAVE_AS_KNOWS_SHANI
+ mbin_dispatch_base_to_avx512_shani sha1_ctx_mgr_init, sha1_ctx_mgr_init_base, \
+ sha1_ctx_mgr_init_sse, sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, \
+ sha1_ctx_mgr_init_avx512, sha1_ctx_mgr_init_sse_ni, sha1_ctx_mgr_init_avx512_ni
+ mbin_dispatch_base_to_avx512_shani sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_base, \
+ sha1_ctx_mgr_submit_sse, sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, \
+ sha1_ctx_mgr_submit_avx512, sha1_ctx_mgr_submit_sse_ni, sha1_ctx_mgr_submit_avx512_ni
+ mbin_dispatch_base_to_avx512_shani sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_base, \
+ sha1_ctx_mgr_flush_sse, sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, \
+ sha1_ctx_mgr_flush_avx512, sha1_ctx_mgr_flush_sse_ni, sha1_ctx_mgr_flush_avx512_ni
+ %else
+ mbin_dispatch_init6 sha1_ctx_mgr_init, sha1_ctx_mgr_init_base, \
+ sha1_ctx_mgr_init_sse, sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, \
+ sha1_ctx_mgr_init_avx512
+ mbin_dispatch_init6 sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_base, \
+ sha1_ctx_mgr_submit_sse, sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, \
+ sha1_ctx_mgr_submit_avx512
+ mbin_dispatch_init6 sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_base, \
+ sha1_ctx_mgr_flush_sse, sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, \
+ sha1_ctx_mgr_flush_avx512
+ %endif
+%else
+ %ifdef HAVE_AS_KNOWS_SHANI
+ mbin_dispatch_sse_to_avx2_shani sha1_ctx_mgr_init, sha1_ctx_mgr_init_sse, \
+ sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, sha1_ctx_mgr_init_sse_ni
+ mbin_dispatch_sse_to_avx2_shani sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_sse, \
+ sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, sha1_ctx_mgr_submit_sse_ni
+ mbin_dispatch_sse_to_avx2_shani sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_sse, \
+ sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, sha1_ctx_mgr_flush_sse_ni
+ %else
+ mbin_dispatch_init sha1_ctx_mgr_init, sha1_ctx_mgr_init_sse, \
+ sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2
+ mbin_dispatch_init sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_sse, \
+ sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2
+ mbin_dispatch_init sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_sse, \
+ sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2
+ %endif
+%endif
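The mechanism the mbin_* macros generate is the usual lazy function-pointer dispatch: the
public symbol initially points at a one-shot init routine that probes the CPU, rewrites
the pointer to the best available variant and then forwards the call. A self-contained C
sketch of the idea (all names below are invented for illustration; the real feature
detection lives in multibinary.asm):

    #include <stdio.h>

    typedef void (*init_fn)(void);

    static void init_base(void) { puts("base code path"); }
    static void init_avx2(void) { puts("avx2 code path"); }

    static int cpu_has_avx2(void) { return 0; }      /* stand-in for the CPUID probing */

    static void dispatch_init(void);
    static init_fn init_dispatched = dispatch_init;  /* the "*_mbinit" starting value */

    static void dispatch_init(void)
    {
            /* runs once: pick the best variant, overwrite the pointer, forward the call */
            init_dispatched = cpu_has_avx2() ? init_avx2 : init_base;
            init_dispatched();
    }

    void public_init(void) { init_dispatched(); }

    int main(void)
    {
            public_init();   /* first call goes through dispatch_init */
            public_init();   /* later calls jump straight to the chosen variant */
            return 0;
    }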
+
+;;; func core, ver, snum
+slversion sha1_ctx_mgr_init, 00, 04, 0148
+slversion sha1_ctx_mgr_submit, 00, 04, 0149
+slversion sha1_ctx_mgr_flush, 00, 04, 0150
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm
new file mode 100644
index 000000000..86d09e303
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x1.asm
@@ -0,0 +1,318 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define FRAMESZ 32 ; space for ABCDE
+%define RSPSAVE rax
+
+%define ABCD xmm0
+; two E's because they ping-pong
+%define E0 xmm1
+%define E1 xmm2
+%define MSG0 xmm3
+%define MSG1 xmm4
+%define MSG2 xmm5
+%define MSG3 xmm6
+%define SHUF_MASK xmm7
+
+; arg index starts from 0, while mgr_flush/submit counts from 1
+%define MGR arg0
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+%define IDX r8 ; local variable -- consistent with caller
+%define DPTR r11 ; local variable -- input buffer pointer
+%define TMP r9 ; local variable -- assistant to address digest
+;%define TMP2 r8 ; local variable -- assistant to address digest
+align 32
+
+; void sha1_ni_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 1 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : hash on which lane
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: rax, r9~r11, xmm0-xmm7
+;
+mk_global sha1_ni_x1, function, internal
+sha1_ni_x1:
+ endbranch
+ mov RSPSAVE, rsp
+ sub rsp, FRAMESZ
+ and rsp, ~0xF ; Align 16Bytes downward
+
+ shl NBLK, 6 ; transform blk amount into bytes
+ jz backto_mgr
+
+ ; detach idx from nlanx4
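+	; (r10 arrives packed as (max_lanes*4 << 8) | lane_index, hence the shr/and below)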
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+ lea TMP, [MGR + 4*IDX]
+ ;; Initialize digest
+ pinsrd ABCD, [TMP + 0*NLANX4], 3
+ pinsrd ABCD, [TMP + 1*NLANX4], 2
+ pinsrd ABCD, [TMP + 2*NLANX4], 1
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd ABCD, [TMP + 1*NLANX4], 0
+ pinsrd E0, [TMP + 2*NLANX4], 3
+ pand E0, [IDX3_WORD_MASK]
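+	; IDX3_WORD_MASK keeps only dword 3 (the E value) and zeroes the other dwords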
+
+ movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK]
+
+ ;; Load input pointers
+ mov DPTR, [MGR + _data_ptr + IDX*8]
+ ;; nblk is used to indicate data end
+ add NBLK, DPTR
+
+lloop:
+ ; Save hash values for addition after rounds
+ movdqa [rsp + 0*16], E0
+ movdqa [rsp + 1*16], ABCD
+
+ ; do rounds 0-3
+ movdqu MSG0, [DPTR + 0*16]
+ pshufb MSG0, SHUF_MASK
+ paddd E0, MSG0
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+
+ ; do rounds 4-7
+ movdqu MSG1, [DPTR + 1*16]
+ pshufb MSG1, SHUF_MASK
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG0, MSG1
+
+ ; do rounds 8-11
+ movdqu MSG2, [DPTR + 2*16]
+ pshufb MSG2, SHUF_MASK
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ ; do rounds 12-15
+ movdqu MSG3, [DPTR + 3*16]
+ pshufb MSG3, SHUF_MASK
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ ; do rounds 16-19
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ ; do rounds 20-23
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ ; do rounds 24-27
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ ; do rounds 28-31
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ ; do rounds 32-35
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ ; do rounds 36-39
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ ; do rounds 40-43
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ ; do rounds 44-47
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ ; do rounds 48-51
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ ; do rounds 52-55
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ ; do rounds 56-59
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ ; do rounds 60-63
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 3
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ ; do rounds 64-67
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 3
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ ; do rounds 68-71
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 3
+ pxor MSG3, MSG1
+
+ ; do rounds 72-75
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 3
+
+ ; do rounds 76-79
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 3
+
+ ; Add current hash values with previously saved
+ sha1nexte E0, [rsp + 0*16]
+ paddd ABCD, [rsp + 1*16]
+
+ ; Increment data pointer and loop if more to process
+ add DPTR, 64
+ cmp DPTR, NBLK
+ jne lloop
+
+ ; write out digests
+ lea TMP, [MGR + 4*IDX]
+ pextrd [TMP + 0*NLANX4], ABCD, 3
+ pextrd [TMP + 1*NLANX4], ABCD, 2
+ pextrd [TMP + 2*NLANX4], ABCD, 1
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMP + 1*NLANX4], ABCD, 0
+ pextrd [TMP + 2*NLANX4], E0, 3
+
+ ; update input pointers
+ mov [MGR + _data_ptr + IDX*8], DPTR
+
+backto_mgr:
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ mov rsp, RSPSAVE
+
+ ret
+
+
+section .data align=16
+PSHUFFLE_SHANI_MASK: dq 0x08090a0b0c0d0e0f, 0x0001020304050607
+IDX3_WORD_MASK: dq 0x0000000000000000, 0xFFFFFFFF00000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_ni_x1
+no_sha1_ni_x1:
+%endif
+%endif ; HAVE_AS_KNOWS_SHANI
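The four-round cadence used throughout sha1_ni_x1 (sha1nexte, sha1rnds4, sha1msg1/sha1msg2)
maps directly onto the SHA-NI compiler intrinsics. A sketch of the first eight rounds,
assuming a toolchain that exposes these intrinsics in immintrin.h and is built with the SHA
extension enabled (e.g. -msha); the function and variable names are invented for
illustration:

    #include <immintrin.h>

    /* abcd holds A..D, e0 holds E in its high dword (as set up by pinsrd/pand above);
     * block[] holds the four 16-byte message words, already byte-swapped. */
    static void sha1ni_rounds_0_to_7(__m128i *abcd, __m128i *e0, const __m128i block[4])
    {
            __m128i e1, msg0, msg1;

            /* rounds 0-3: add W0..3 into E, then four rounds with f0/K0 (imm = 0) */
            msg0  = block[0];
            *e0   = _mm_add_epi32(*e0, msg0);
            e1    = *abcd;
            *abcd = _mm_sha1rnds4_epu32(*abcd, *e0, 0);

            /* rounds 4-7: sha1nexte derives the next E term from the old ABCD and W4..7 */
            msg1  = block[1];
            e1    = _mm_sha1nexte_epu32(e1, msg1);
            *e0   = *abcd;                            /* old ABCD feeds the following group */
            *abcd = _mm_sha1rnds4_epu32(*abcd, e1, 0);
            (void)_mm_sha1msg1_epu32(msg0, msg1);     /* first half of the W16..19 schedule */
    }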
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm
new file mode 100644
index 000000000..7b0ddb74e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ni_x2.asm
@@ -0,0 +1,484 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define FRAMESZ 64 ; space for ABCDE
+%define RSPSAVE rax
+
+%define ABCD xmm0
+; two E's because they ping-pong
+%define E0 xmm1
+%define E1 xmm2
+%define MSG0 xmm3
+%define MSG1 xmm4
+%define MSG2 xmm5
+%define MSG3 xmm6
+
+%define ABCDb xmm7
+%define E0b xmm8 ; Need two E's b/c they ping pong
+%define E1b xmm9
+%define MSG0b xmm10
+%define MSG1b xmm11
+%define MSG2b xmm12
+%define MSG3b xmm13
+
+%define SHUF_MASK xmm14
+
+; arg index starts from 0, while mgr_flush/submit counts from 1
+%define MGR arg0
+
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+%define IDX r8 ; local variable -- consistent with caller
+%define DPTR r11 ; local variable -- input buffer pointer
+%define DPTRb r12 ;
+%define TMP r9 ; local variable -- assistant to address digest
+%define TMPb r13 ; local variable -- assistant to address digest
+align 32
+
+; void sha1_ni_x2(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 2 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : hash on which lane
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: rax, r9~r13, xmm0-xmm14
+;
+mk_global sha1_ni_x2, function, internal
+sha1_ni_x2:
+ endbranch
+ mov RSPSAVE, rsp
+ sub rsp, FRAMESZ
+ and rsp, ~0xF ; Align 16Bytes downward
+
+ shl NBLK, 6 ; transform blk amount into bytes
+ jz backto_mgr
+
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+ lea TMP, [MGR + _args_digest ];
+ lea TMPb,[MGR + _args_digest + 4*1];
+
+ ;; Initialize digest
+ pinsrd ABCD, [TMP + 0*NLANX4], 3
+ pinsrd ABCD, [TMP + 1*NLANX4], 2
+ pinsrd ABCD, [TMP + 2*NLANX4], 1
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd ABCD, [TMP + 1*NLANX4], 0
+ pinsrd E0, [TMP + 2*NLANX4], 3
+ pand E0, [IDX3_WORD_MASK]
+
+ pinsrd ABCDb, [TMPb + 0*NLANX4], 3
+ pinsrd ABCDb, [TMPb + 1*NLANX4], 2
+ pinsrd ABCDb, [TMPb + 2*NLANX4], 1
+ lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd ABCDb, [TMPb + 1*NLANX4], 0
+ pinsrd E0b, [TMPb + 2*NLANX4], 3
+ pand E0b, [IDX3_WORD_MASK]
+
+ movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK]
+
+ ;; Load input pointers
+ mov DPTR, [MGR + _data_ptr ]
+ mov DPTRb,[MGR + _data_ptr + 8*1]
+ ;; nblk is used to indicate data end
+ add NBLK, DPTR
+
+lloop:
+ movdqa [rsp + 0*16], E0
+ movdqa [rsp + 1*16], ABCD
+
+ movdqa [rsp + 2*16], E0b
+ movdqa [rsp + 3*16], ABCDb
+
+ ; do rounds 0-3
+ movdqu MSG0, [DPTR + 0*16]
+ pshufb MSG0, SHUF_MASK
+ paddd E0, MSG0
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+
+ movdqu MSG0b, [DPTRb + 0*16]
+ pshufb MSG0b, SHUF_MASK
+ paddd E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1rnds4 ABCDb, E0b, 0
+
+ ; do rounds 4-7
+ movdqu MSG1, [DPTR + 1*16]
+ pshufb MSG1, SHUF_MASK
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG0, MSG1
+
+ movdqu MSG1b, [DPTRb + 1*16]
+ pshufb MSG1b, SHUF_MASK
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1rnds4 ABCDb, E1b, 0
+ sha1msg1 MSG0b, MSG1b
+
+ ; do rounds 8-11
+ movdqu MSG2, [DPTR + 2*16]
+ pshufb MSG2, SHUF_MASK
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ movdqu MSG2b, [DPTRb + 2*16]
+ pshufb MSG2b, SHUF_MASK
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1rnds4 ABCDb, E0b, 0
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 12-15
+ movdqu MSG3, [DPTR + 3*16]
+ pshufb MSG3, SHUF_MASK
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 0
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ movdqu MSG3b, [DPTRb + 3*16]
+ pshufb MSG3b, SHUF_MASK
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 0
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ; do rounds 16-19
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 0
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 0
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 20-23
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ; do rounds 24-27
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 1
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 28-31
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ; do rounds 32-35
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 1
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 1
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 36-39
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 1
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 1
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ; do rounds 40-43
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 44-47
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 2
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ; do rounds 48-51
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 52-55
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 2
+ sha1msg1 MSG0, MSG1
+ pxor MSG3, MSG1
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 2
+ sha1msg1 MSG0b, MSG1b
+ pxor MSG3b, MSG1b
+
+ ; do rounds 56-59
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 2
+ sha1msg1 MSG1, MSG2
+ pxor MSG0, MSG2
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 2
+ sha1msg1 MSG1b, MSG2b
+ pxor MSG0b, MSG2b
+
+ ; do rounds 60-63
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1msg2 MSG0, MSG3
+ sha1rnds4 ABCD, E1, 3
+ sha1msg1 MSG2, MSG3
+ pxor MSG1, MSG3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG0b, MSG3b
+ sha1rnds4 ABCDb, E1b, 3
+ sha1msg1 MSG2b, MSG3b
+ pxor MSG1b, MSG3b
+
+ ; do rounds 64-67
+ sha1nexte E0, MSG0
+ movdqa E1, ABCD
+ sha1msg2 MSG1, MSG0
+ sha1rnds4 ABCD, E0, 3
+ sha1msg1 MSG3, MSG0
+ pxor MSG2, MSG0
+
+ sha1nexte E0b, MSG0b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG1b, MSG0b
+ sha1rnds4 ABCDb, E0b, 3
+ sha1msg1 MSG3b, MSG0b
+ pxor MSG2b, MSG0b
+
+ ; do rounds 68-71
+ sha1nexte E1, MSG1
+ movdqa E0, ABCD
+ sha1msg2 MSG2, MSG1
+ sha1rnds4 ABCD, E1, 3
+ pxor MSG3, MSG1
+
+ sha1nexte E1b, MSG1b
+ movdqa E0b, ABCDb
+ sha1msg2 MSG2b, MSG1b
+ sha1rnds4 ABCDb, E1b, 3
+ pxor MSG3b, MSG1b
+
+ ; do rounds 72-75
+ sha1nexte E0, MSG2
+ movdqa E1, ABCD
+ sha1msg2 MSG3, MSG2
+ sha1rnds4 ABCD, E0, 3
+
+ sha1nexte E0b, MSG2b
+ movdqa E1b, ABCDb
+ sha1msg2 MSG3b, MSG2b
+ sha1rnds4 ABCDb, E0b, 3
+
+ ; do rounds 76-79
+ sha1nexte E1, MSG3
+ movdqa E0, ABCD
+ sha1rnds4 ABCD, E1, 3
+
+ sha1nexte E1b, MSG3b
+ movdqa E0b, ABCDb
+ sha1rnds4 ABCDb, E1b, 3
+
+ ; Add current hash values with previously saved
+ sha1nexte E0, [rsp + 0*16]
+ paddd ABCD, [rsp + 1*16]
+
+ sha1nexte E0b, [rsp + 2*16]
+ paddd ABCDb, [rsp + 3*16]
+
+ ; Increment data pointer and loop if more to process
+ add DPTR, 64
+ add DPTRb, 64
+ cmp DPTR, NBLK
+ jne lloop
+
+ ; write out digests
+ lea TMP, [MGR + _args_digest]
+ pextrd [TMP + 0*NLANX4], ABCD, 3
+ pextrd [TMP + 1*NLANX4], ABCD, 2
+ pextrd [TMP + 2*NLANX4], ABCD, 1
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMP + 1*NLANX4], ABCD, 0
+ pextrd [TMP + 2*NLANX4], E0, 3
+
+ lea TMPb, [MGR +_args_digest + 4*1]
+ pextrd [TMPb + 0*NLANX4], ABCDb, 3
+ pextrd [TMPb + 1*NLANX4], ABCDb, 2
+ pextrd [TMPb + 2*NLANX4], ABCDb, 1
+ lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMPb + 1*NLANX4], ABCDb, 0
+ pextrd [TMPb + 2*NLANX4], E0b, 3
+
+ ; update input pointers
+ mov [MGR + _data_ptr], DPTR
+ mov [MGR + _data_ptr + 8*1], DPTRb
+
+backto_mgr:
+;;;;;;;;;;;;;;;;
+;; Postamble
+
+ mov rsp, RSPSAVE
+
+ ret
+
+section .data align=16
+PSHUFFLE_SHANI_MASK: dq 0x08090a0b0c0d0e0f, 0x0001020304050607
+IDX3_WORD_MASK: dq 0x0000000000000000, 0xFFFFFFFF00000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_ni_x2
+no_sha1_ni_x2:
+%endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm
new file mode 100644
index 000000000..aeb00a008
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_opt_x1.asm
@@ -0,0 +1,485 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+;; FRAMESZ plus pushes must be an odd multiple of 8, so that rsp is 16-byte aligned
+;; after the sub below (the call already pushed an 8-byte return address)
+_GPR_SAVE_SIZE equ 8*9 ;rbx, rdx, rbp, (rdi, rsi), r12~r15
+_WK_SAVE_SIZE equ 16*4
+
+_WK_SAVE equ 0
+_GPR_SAVE equ _WK_SAVE + _WK_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE
+
+; arg indexing starts from 0, while mgr_flush/submit starts from 1
+%define MGR arg0
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+; rax~rdx, rsi, rdi, rbp are used for RR
+%define N_MGR r8
+%define IDX r9 ; local variable -- consistent with caller
+%define K_BASE r11
+%define BUFFER_PTR r12
+%define BUFFER_END r13
+%define TMP     r14             ; local variable -- used to address the digest
+
+%xdefine W_TMP xmm0
+%xdefine W_TMP2 xmm9
+
+%xdefine W0 xmm1
+%xdefine W4 xmm2
+%xdefine W8 xmm3
+%xdefine W12 xmm4
+%xdefine W16 xmm5
+%xdefine W20 xmm6
+%xdefine W24 xmm7
+%xdefine W28 xmm8
+
+%xdefine XMM_SHUFB_BSWAP xmm10
+
+;; we keep a circular buffer of pre-calculated w[i]+K values (16 dwords, 64 bytes) on the stack
+%xdefine WK(t) (rsp + (t & 15)*4)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Constants
+
+%xdefine K1 0x5a827999
+%xdefine K2 0x6ed9eba1
+%xdefine K3 0x8f1bbcdc
+%xdefine K4 0xca62c1d6
+
+%xdefine W_PRECALC_AHEAD 16
+%xdefine W_NO_TAIL_PRECALC 0
+
+; Rounds macros
+
+%macro REGALLOC 0
+ %xdefine A ecx
+ %xdefine B esi
+ %xdefine C edi
+ %xdefine D ebp
+ %xdefine E edx
+
+ %xdefine T1 eax
+ %xdefine T2 ebx
+%endmacro
+
+%macro F1 3
+ mov T1,%2
+ xor T1,%3
+ and T1,%1
+ xor T1,%3
+%endmacro
+
+%macro F2 3
+ mov T1,%3
+ xor T1,%2
+ xor T1,%1
+%endmacro
+
+%macro F3 3
+ mov T1,%2
+ mov T2,%1
+ or T1,%1
+ and T2,%2
+ and T1,%3
+ or T1,T2
+%endmacro
+
+%define F4 F2
+
+%macro UPDATE_HASH 2
+ add %2, %1
+ mov %1, %2
+%endmacro
+
+
+%macro W_PRECALC 1
+ %xdefine i (%1)
+
+ %if (i < 20)
+ %xdefine K_XMM 0
+ %elif (i < 40)
+ %xdefine K_XMM 16
+ %elif (i < 60)
+ %xdefine K_XMM 32
+ %else
+ %xdefine K_XMM 48
+ %endif
+
+ %if (i<16 || (i>=80 && i<(80 + W_PRECALC_AHEAD)))
+
+ %if (W_NO_TAIL_PRECALC == 0)
+
+ %xdefine i ((%1) % 80) ;; pre-compute for the next iteration
+
+ %if (i == 0)
+ W_PRECALC_RESET
+ %endif
+
+
+ W_PRECALC_00_15
+ %endif
+
+ %elif (i < 32)
+ W_PRECALC_16_31
+ %elif (i < 80) ;; rounds 32-79
+ W_PRECALC_32_79
+ %endif
+%endmacro
+
+%macro W_PRECALC_RESET 0
+ %xdefine W W0
+ %xdefine W_minus_04 W4
+ %xdefine W_minus_08 W8
+ %xdefine W_minus_12 W12
+ %xdefine W_minus_16 W16
+ %xdefine W_minus_20 W20
+ %xdefine W_minus_24 W24
+ %xdefine W_minus_28 W28
+ %xdefine W_minus_32 W
+%endmacro
+
+%macro W_PRECALC_ROTATE 0
+ %xdefine W_minus_32 W_minus_28
+ %xdefine W_minus_28 W_minus_24
+ %xdefine W_minus_24 W_minus_20
+ %xdefine W_minus_20 W_minus_16
+ %xdefine W_minus_16 W_minus_12
+ %xdefine W_minus_12 W_minus_08
+ %xdefine W_minus_08 W_minus_04
+ %xdefine W_minus_04 W
+ %xdefine W W_minus_32
+%endmacro
+
+%macro W_PRECALC_00_15 0
+ ;; message scheduling pre-compute for rounds 0-15
+ %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
+ movdqu W_TMP, [BUFFER_PTR + (i * 4)]
+ %elif ((i & 3) == 1)
+ pshufb W_TMP, XMM_SHUFB_BSWAP
+ movdqa W, W_TMP
+ %elif ((i & 3) == 2)
+ paddd W_TMP, [K_BASE]
+ %elif ((i & 3) == 3)
+ movdqa [WK(i&~3)], W_TMP
+
+ W_PRECALC_ROTATE
+ %endif
+%endmacro
+
+%macro W_PRECALC_16_31 0
+ ;; message scheduling pre-compute for rounds 16-31
+ ;; calculating last 32 w[i] values in 8 XMM registers
+ ;; pre-calculate K+w[i] values and store to mem, for later load by ALU add instruction
+ ;;
+ ;; "brute force" vectorization for rounds 16-31 only due to w[i]->w[i-3] dependency
+ ;;
+ %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
+ movdqa W, W_minus_12
+ palignr W, W_minus_16, 8 ;; w[i-14]
+ movdqa W_TMP, W_minus_04
+ psrldq W_TMP, 4 ;; w[i-3]
+ pxor W, W_minus_08
+ %elif ((i & 3) == 1)
+ pxor W_TMP, W_minus_16
+ pxor W, W_TMP
+ movdqa W_TMP2, W
+ movdqa W_TMP, W
+ pslldq W_TMP2, 12
+ %elif ((i & 3) == 2)
+ psrld W, 31
+ pslld W_TMP, 1
+ por W_TMP, W
+ movdqa W, W_TMP2
+ psrld W_TMP2, 30
+ pslld W, 2
+ %elif ((i & 3) == 3)
+ pxor W_TMP, W
+ pxor W_TMP, W_TMP2
+ movdqa W, W_TMP
+ paddd W_TMP, [K_BASE + K_XMM]
+ movdqa [WK(i&~3)],W_TMP
+
+ W_PRECALC_ROTATE
+ %endif
+%endmacro
+
+%macro W_PRECALC_32_79 0
+ ;; in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
+	;; instead we compute the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
+	;; which allows more efficient vectorization, since the w[i] => w[i-3] dependency is broken
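+	;; (to see the equivalence, expand each of the four terms of the original recurrence
+	;;  once more with the same recurrence: w[i-11], w[i-17], w[i-19], w[i-22], w[i-24] and
+	;;  w[i-30] each occur twice and cancel, leaving w[i-6]^w[i-16]^w[i-28]^w[i-32] under a
+	;;  double rol-by-1, i.e. rol 2; valid for i >= 32, hence the separate 16-31 path above)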
+ ;;
+ %if ((i & 3) == 0) ;; blended SSE and ALU instruction scheduling, 1 vector iteration per 4 rounds
+ movdqa W_TMP, W_minus_04
+ pxor W, W_minus_28 ;; W is W_minus_32 before xor
+ palignr W_TMP, W_minus_08, 8
+ %elif ((i & 3) == 1)
+ pxor W, W_minus_16
+ pxor W, W_TMP
+ movdqa W_TMP, W
+ %elif ((i & 3) == 2)
+ psrld W, 30
+ pslld W_TMP, 2
+ por W_TMP, W
+ %elif ((i & 3) == 3)
+ movdqa W, W_TMP
+ paddd W_TMP, [K_BASE + K_XMM]
+ movdqa [WK(i&~3)],W_TMP
+
+ W_PRECALC_ROTATE
+ %endif
+%endmacro
+
+%macro RR 6 ;; RR does two rounds of SHA-1 back to back with W pre-calculation
+
+ ;; TEMP = A
+ ;; A = F( i, B, C, D ) + E + ROTATE_LEFT( A, 5 ) + W[i] + K(i)
+ ;; C = ROTATE_LEFT( B, 30 )
+ ;; D = C
+ ;; E = D
+ ;; B = TEMP
+
+ W_PRECALC (%6 + W_PRECALC_AHEAD)
+ F %2, %3, %4 ;; F returns result in T1
+ add %5, [WK(%6)]
+ rol %2, 30
+ mov T2, %1
+ add %4, [WK(%6 + 1)]
+ rol T2, 5
+ add %5, T1
+
+ W_PRECALC (%6 + W_PRECALC_AHEAD + 1)
+ add T2, %5
+ mov %5, T2
+ rol T2, 5
+ add %4, T2
+ F %1, %2, %3 ;; F returns result in T1
+ add %4, T1
+ rol %1, 30
+
+;; write: %1, %2
+;; rotate: %1<=%4, %2<=%5, %3<=%1, %4<=%2, %5<=%3
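+;; hence consecutive invocations rotate the argument order: e.g. RR A,B,C,D,E,0 is
+;; followed by RR D,E,A,B,C,2 (see the round sequence in the function body below)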
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; void sha1_opt_x1(SHA1_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only 4 of the 16 lanes used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : which lane to hash on
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (the digest is located through it)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: all general regs (except r15), xmm0-xmm10
+; {rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack}
+;
+mk_global sha1_opt_x1, function, internal
+sha1_opt_x1:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ mov [rsp + _GPR_SAVE + 8*3], rsi
+ ; caller has already stored XMM6~10
+%endif
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+ mov [rsp + _GPR_SAVE + 8*8], rdx
+
+
+ shl NBLK, 6 ; transform blk amount into bytes
+ jz .lend
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+	;; let sha1_opt take over r8~r11
+ ;; Load input pointers
+ mov N_MGR, MGR
+ mov BUFFER_PTR, [MGR + _data_ptr + IDX*8]
+ ;; nblk is used to indicate data end
+ add NBLK, BUFFER_PTR
+ mov BUFFER_END, NBLK
+
+ lea K_BASE, [K_XMM_AR]
+ movdqu XMM_SHUFB_BSWAP, [bswap_shufb_ctl]
+
+ REGALLOC
+
+ lea TMP, [N_MGR + 4*IDX]
+ ;; Initialize digest
+ mov A, [TMP + 0*NLANX4]
+ mov B, [TMP + 1*NLANX4]
+ mov C, [TMP + 2*NLANX4]
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ mov D, [TMP + 1*NLANX4]
+ mov E, [TMP + 2*NLANX4]
+
+ %assign i 0
+ %rep W_PRECALC_AHEAD
+ W_PRECALC i
+ %assign i i+1
+ %endrep
+
+ %xdefine F F1
+
+.lloop:
+ cmp BUFFER_PTR, K_BASE ;; we use K_BASE value as a signal of a last block,
+ jne .lbegin ;; it is set below by: cmovae BUFFER_PTR, K_BASE
+ jmp .lend
+
+.lbegin:
+ RR A,B,C,D,E,0
+ RR D,E,A,B,C,2
+ RR B,C,D,E,A,4
+ RR E,A,B,C,D,6
+ RR C,D,E,A,B,8
+
+ RR A,B,C,D,E,10
+ RR D,E,A,B,C,12
+ RR B,C,D,E,A,14
+ RR E,A,B,C,D,16
+ RR C,D,E,A,B,18
+
+ %xdefine F F2
+
+ RR A,B,C,D,E,20
+ RR D,E,A,B,C,22
+ RR B,C,D,E,A,24
+ RR E,A,B,C,D,26
+ RR C,D,E,A,B,28
+
+ RR A,B,C,D,E,30
+ RR D,E,A,B,C,32
+ RR B,C,D,E,A,34
+ RR E,A,B,C,D,36
+ RR C,D,E,A,B,38
+
+ %xdefine F F3
+
+ RR A,B,C,D,E,40
+ RR D,E,A,B,C,42
+ RR B,C,D,E,A,44
+ RR E,A,B,C,D,46
+ RR C,D,E,A,B,48
+
+ RR A,B,C,D,E,50
+ RR D,E,A,B,C,52
+ RR B,C,D,E,A,54
+ RR E,A,B,C,D,56
+ RR C,D,E,A,B,58
+
+ %xdefine F F4
+
+ add BUFFER_PTR, 64 ;; move to next 64-byte block
+ cmp BUFFER_PTR, BUFFER_END ;; check if current block is the last one
+ cmovae BUFFER_PTR, K_BASE ;; smart way to signal the last iteration
+
+ RR A,B,C,D,E,60
+ RR D,E,A,B,C,62
+ RR B,C,D,E,A,64
+ RR E,A,B,C,D,66
+ RR C,D,E,A,B,68
+
+ RR A,B,C,D,E,70
+ RR D,E,A,B,C,72
+ RR B,C,D,E,A,74
+ RR E,A,B,C,D,76
+ RR C,D,E,A,B,78
+
+ lea TMP, [N_MGR + 4*IDX]
+ UPDATE_HASH [TMP + 0*NLANX4],A
+ UPDATE_HASH [TMP + 1*NLANX4],B
+ UPDATE_HASH [TMP + 2*NLANX4],C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ UPDATE_HASH [TMP + 1*NLANX4],D
+ UPDATE_HASH [TMP + 2*NLANX4],E
+
+ jmp .lloop
+
+ .lend:
+ mov MGR, N_MGR
+
+ mov rdx, [rsp + _GPR_SAVE + 8*8]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rsi, [rsp + _GPR_SAVE + 8*3]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbp, [rsp + _GPR_SAVE + 8*1]
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SPACE
+
+ ret
+
+
+;;----------------------
+section .data align=64
+
+align 128
+K_XMM_AR:
+ DD K1, K1, K1, K1
+ DD K2, K2, K2, K2
+ DD K3, K3, K3, K3
+ DD K4, K4, K4, K4
+
+align 16
+bswap_shufb_ctl:
+ DD 00010203h
+ DD 04050607h
+ DD 08090a0bh
+ DD 0c0d0e0fh
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c
new file mode 100644
index 000000000..e82fb30fe
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c
@@ -0,0 +1,220 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha1_mb.h"
+#include "endian_helper.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA1 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define H0 0x67452301
+#define H1 0xefcdab89
+#define H2 0x98badcfe
+#define H3 0x10325476
+#define H4 0xc3d2e1f0
+
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (b ^ c ^ d)
+#define F3(b,c,d) ((b & c) | (d & (b | c)))
+#define F4(b,c,d) (b ^ c ^ d)
+
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+#define W(x) w[(x) & 15]
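+/* 16-entry circular window over the message schedule: only the most recent 16 w values
+ * are kept, which is all the recurrence w[i-3]^w[i-8]^w[i-14]^w[i-16] ever needs. */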
+
+#define step00_19(i,a,b,c,d,e) \
+ if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ else W(i) = to_be32(ww[i]); \
+ e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \
+ b = rol32(b,30)
+
+#define step20_39(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \
+ b = rol32(b,30)
+
+#define step40_59(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \
+ b = rol32(b,30)
+
+#define step60_79(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \
+ b = rol32(b,30)
+
+static void OPT_FIX sha1_single(const uint8_t * data, uint32_t digest[]);
+
+void sha1_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA1_BLOCK_SIZE];
+
+ digest[0] = H0;
+ digest[1] = H1;
+ digest[2] = H2;
+ digest[3] = H3;
+ digest[4] = H4;
+
+ i = len;
+ while (i >= SHA1_BLOCK_SIZE) {
+ sha1_single(input_data, digest);
+ input_data += SHA1_BLOCK_SIZE;
+ i -= SHA1_BLOCK_SIZE;
+ }
+
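+	/* Pad the final partial block: append the 0x80 marker byte, zero-fill, and store the
+	 * total message length in bits as a 64-bit big-endian value at the end of the last
+	 * block; this needs one or two extra blocks depending on the space left. */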
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - SHA1_PADLENGTHFIELD_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA1_BLOCK_SIZE - SHA1_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA1_BLOCK_SIZE;
+ else
+ i = SHA1_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha1_single(buf, digest);
+ if (i == (2 * SHA1_BLOCK_SIZE))
+ sha1_single(buf + SHA1_BLOCK_SIZE, digest);
+}
+
+void sha1_single(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e;
+ uint32_t w[16] = { 0 };
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+
+ step00_19(0, a, b, c, d, e);
+ step00_19(1, e, a, b, c, d);
+ step00_19(2, d, e, a, b, c);
+ step00_19(3, c, d, e, a, b);
+ step00_19(4, b, c, d, e, a);
+ step00_19(5, a, b, c, d, e);
+ step00_19(6, e, a, b, c, d);
+ step00_19(7, d, e, a, b, c);
+ step00_19(8, c, d, e, a, b);
+ step00_19(9, b, c, d, e, a);
+ step00_19(10, a, b, c, d, e);
+ step00_19(11, e, a, b, c, d);
+ step00_19(12, d, e, a, b, c);
+ step00_19(13, c, d, e, a, b);
+ step00_19(14, b, c, d, e, a);
+ step00_19(15, a, b, c, d, e);
+ step00_19(16, e, a, b, c, d);
+ step00_19(17, d, e, a, b, c);
+ step00_19(18, c, d, e, a, b);
+ step00_19(19, b, c, d, e, a);
+
+ step20_39(20, a, b, c, d, e);
+ step20_39(21, e, a, b, c, d);
+ step20_39(22, d, e, a, b, c);
+ step20_39(23, c, d, e, a, b);
+ step20_39(24, b, c, d, e, a);
+ step20_39(25, a, b, c, d, e);
+ step20_39(26, e, a, b, c, d);
+ step20_39(27, d, e, a, b, c);
+ step20_39(28, c, d, e, a, b);
+ step20_39(29, b, c, d, e, a);
+ step20_39(30, a, b, c, d, e);
+ step20_39(31, e, a, b, c, d);
+ step20_39(32, d, e, a, b, c);
+ step20_39(33, c, d, e, a, b);
+ step20_39(34, b, c, d, e, a);
+ step20_39(35, a, b, c, d, e);
+ step20_39(36, e, a, b, c, d);
+ step20_39(37, d, e, a, b, c);
+ step20_39(38, c, d, e, a, b);
+ step20_39(39, b, c, d, e, a);
+
+ step40_59(40, a, b, c, d, e);
+ step40_59(41, e, a, b, c, d);
+ step40_59(42, d, e, a, b, c);
+ step40_59(43, c, d, e, a, b);
+ step40_59(44, b, c, d, e, a);
+ step40_59(45, a, b, c, d, e);
+ step40_59(46, e, a, b, c, d);
+ step40_59(47, d, e, a, b, c);
+ step40_59(48, c, d, e, a, b);
+ step40_59(49, b, c, d, e, a);
+ step40_59(50, a, b, c, d, e);
+ step40_59(51, e, a, b, c, d);
+ step40_59(52, d, e, a, b, c);
+ step40_59(53, c, d, e, a, b);
+ step40_59(54, b, c, d, e, a);
+ step40_59(55, a, b, c, d, e);
+ step40_59(56, e, a, b, c, d);
+ step40_59(57, d, e, a, b, c);
+ step40_59(58, c, d, e, a, b);
+ step40_59(59, b, c, d, e, a);
+
+ step60_79(60, a, b, c, d, e);
+ step60_79(61, e, a, b, c, d);
+ step60_79(62, d, e, a, b, c);
+ step60_79(63, c, d, e, a, b);
+ step60_79(64, b, c, d, e, a);
+ step60_79(65, a, b, c, d, e);
+ step60_79(66, e, a, b, c, d);
+ step60_79(67, d, e, a, b, c);
+ step60_79(68, c, d, e, a, b);
+ step60_79(69, b, c, d, e, a);
+ step60_79(70, a, b, c, d, e);
+ step60_79(71, e, a, b, c, d);
+ step60_79(72, d, e, a, b, c);
+ step60_79(73, c, d, e, a, b);
+ step60_79(74, b, c, d, e, a);
+ step60_79(75, a, b, c, d, e);
+ step60_79(76, e, a, b, c, d);
+ step60_79(77, d, e, a, b, c);
+ step60_79(78, c, d, e, a, b);
+ step60_79(79, b, c, d, e, a);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am
new file mode 100644
index 000000000..9405c2469
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am
@@ -0,0 +1,127 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += sha256_mb/sha256_ctx_sse.c \
+ sha256_mb/sha256_ctx_avx.c \
+ sha256_mb/sha256_ctx_avx2.c \
+ sha256_mb/sha256_ctx_base.c
+
+lsrc_x86_64 += sha256_mb/sha256_mb_mgr_init_sse.c \
+ sha256_mb/sha256_mb_mgr_init_avx2.c
+
+
+lsrc_x86_64 += sha256_mb/sha256_mb_mgr_submit_sse.asm \
+ sha256_mb/sha256_mb_mgr_submit_avx.asm \
+ sha256_mb/sha256_mb_mgr_submit_avx2.asm \
+ sha256_mb/sha256_mb_mgr_flush_sse.asm \
+ sha256_mb/sha256_mb_mgr_flush_avx.asm \
+ sha256_mb/sha256_mb_mgr_flush_avx2.asm \
+ sha256_mb/sha256_mb_x4_sse.asm \
+ sha256_mb/sha256_mb_x4_avx.asm \
+ sha256_mb/sha256_mb_x8_avx2.asm \
+ sha256_mb/sha256_multibinary.asm
+
+lsrc_x86_64 += sha256_mb/sha256_ctx_avx512.c \
+ sha256_mb/sha256_mb_mgr_init_avx512.c \
+ sha256_mb/sha256_mb_mgr_submit_avx512.asm \
+ sha256_mb/sha256_mb_mgr_flush_avx512.asm \
+ sha256_mb/sha256_mb_x16_avx512.asm
+
+lsrc_x86_64 += sha256_mb/sha256_opt_x1.asm
+
+lsrc_x86_64 += sha256_mb/sha256_ni_x1.asm \
+ sha256_mb/sha256_ni_x2.asm \
+ sha256_mb/sha256_ctx_sse_ni.c \
+ sha256_mb/sha256_ctx_avx512_ni.c \
+ sha256_mb/sha256_mb_mgr_submit_sse_ni.asm \
+ sha256_mb/sha256_mb_mgr_flush_sse_ni.asm \
+ sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += sha256_mb/sha256_ctx_base.c \
+ sha256_mb/sha256_ref.c
+
+lsrc_aarch64 += sha256_mb/aarch64/sha256_mb_multibinary.S \
+ sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c \
+ sha256_mb/aarch64/sha256_ctx_ce.c \
+ sha256_mb/aarch64/sha256_mb_mgr_ce.c \
+ sha256_mb/aarch64/sha256_mb_x1_ce.S \
+ sha256_mb/aarch64/sha256_mb_x2_ce.S \
+ sha256_mb/aarch64/sha256_mb_x3_ce.S \
+ sha256_mb/aarch64/sha256_mb_x4_ce.S
+
+
+lsrc_base_aliases += sha256_mb/sha256_ctx_base_aliases.c \
+ sha256_mb/sha256_ctx_base.c \
+ sha256_mb/sha256_ref.c
+
+src_include += -I $(srcdir)/sha256_mb
+
+extern_hdrs += include/sha256_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ include/multibinary.asm \
+ sha256_mb/sha256_job.asm \
+ sha256_mb/sha256_mb_mgr_datastruct.asm \
+ include/reg_sizes.asm \
+ sha256_mb/sha256_ref.c \
+ include/memcpy_inline.h \
+ include/memcpy.asm \
+ include/intrinreg.h
+
+check_tests += sha256_mb/sha256_mb_test \
+ sha256_mb/sha256_mb_rand_test \
+ sha256_mb/sha256_mb_rand_update_test \
+ sha256_mb/sha256_mb_flush_test
+
+unit_tests += sha256_mb/sha256_mb_rand_ssl_test
+
+perf_tests += sha256_mb/sha256_mb_vs_ossl_perf \
+ sha256_mb/sha256_mb_vs_ossl_shortage_perf
+
+sha256_mb_rand_ssl_test: sha256_ref.o
+sha256_mb_rand_test: sha256_ref.o
+sha256_mb_sha256_mb_rand_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la
+
+sha256_mb_rand_update_test: sha256_ref.o
+sha256_mb_sha256_mb_rand_update_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la
+
+sha256_mb_flush_test: sha256_ref.o
+sha256_mb_sha256_mb_flush_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la
+
+sha256_mb_rand_ssl_test: LDLIBS += -lcrypto
+sha256_mb_sha256_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sha256_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sha256_mb_sha256_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
+sha256_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto
+sha256_mb_sha256_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c
new file mode 100644
index 000000000..4776f55bd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_ctx_ce.c
@@ -0,0 +1,256 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+void sha256_mb_mgr_init_ce(SHA256_MB_JOB_MGR * state);
+SHA256_JOB *sha256_mb_mgr_submit_ce(SHA256_MB_JOB_MGR * state, SHA256_JOB * job);
+SHA256_JOB *sha256_mb_mgr_flush_ce(SHA256_MB_JOB_MGR * state);
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_ce(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_ce(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_ce(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_ce(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_ce(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
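+	/* i is now the total padded length in bytes (one or two whole blocks), i.e. the end
+	 * of the last extra block, so the bit-length field is written at i - 8 below. */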
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_ce_slver_02020142;
+struct slver sha256_ctx_mgr_init_ce_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_submit_ce_slver_02020143;
+struct slver sha256_ctx_mgr_submit_ce_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_flush_ce_slver_02020144;
+struct slver sha256_ctx_mgr_flush_ce_slver = { 0x0144, 0x02, 0x02 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c
new file mode 100644
index 000000000..8627991c3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_aarch64_dispatcher.c
@@ -0,0 +1,59 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
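+/* Runtime dispatch: HWCAP_SHA2 in the hardware capabilities reported by getauxval()
+ * indicates the ARMv8 SHA-2 Crypto Extensions, so the _ce implementations are picked;
+ * otherwise the generic base implementations are used. */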
+DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_submit)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA2)
+ return PROVIDER_INFO(sha256_ctx_mgr_submit_ce);
+
+ return PROVIDER_BASIC(sha256_ctx_mgr_submit);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_init)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA2)
+ return PROVIDER_INFO(sha256_ctx_mgr_init_ce);
+
+ return PROVIDER_BASIC(sha256_ctx_mgr_init);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha256_ctx_mgr_flush)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA2)
+ return PROVIDER_INFO(sha256_ctx_mgr_flush_ce);
+
+ return PROVIDER_BASIC(sha256_ctx_mgr_flush);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c
new file mode 100644
index 000000000..aa63c4dd8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_mgr_ce.c
@@ -0,0 +1,254 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sha256_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SHA256_MB_CE_MAX_LANES 3
+
+#if SHA256_MB_CE_MAX_LANES >=4
+void sha256_mb_ce_x4(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+#endif
+#if SHA256_MB_CE_MAX_LANES >=3
+void sha256_mb_ce_x3(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+#endif
+#if SHA256_MB_CE_MAX_LANES >=2
+void sha256_mb_ce_x2(SHA256_JOB *, SHA256_JOB *, int);
+#endif
+void sha256_mb_ce_x1(SHA256_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
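+/* lens[i] packs (remaining length in blocks) << 4 | lane index, so taking the plain
+ * integer minimum over lens[] yields both the shortest job (upper bits) and its lane
+ * number (low nibble); see sha256_mb_mgr_do_jobs() and sha256_mb_mgr_insert_job(). */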
+void sha256_mb_mgr_init_ce(SHA256_MB_JOB_MGR * state)
+{
+ int i;
+
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = SHA256_MB_CE_MAX_LANES - 1; i >= 0; i--) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	//lanes >= SHA256_MB_CE_MAX_LANES are invalid lanes
+ for (i = SHA256_MB_CE_MAX_LANES; i < SHA256_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sha256_mb_mgr_do_jobs(SHA256_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i, lanes;
+
+ int lane_idx_array[SHA256_MAX_LANES];
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+#if SHA256_MB_CE_MAX_LANES == 4
+ if (state->num_lanes_inuse == 4) {
+ len = min(min(state->lens[0], state->lens[1]),
+ min(state->lens[2], state->lens[3]));
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sha256_mb_ce_x4(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane,
+ state->ldata[3].job_in_lane, len >> 4);
+
+ } else
+#elif SHA256_MB_CE_MAX_LANES == 3
+ if (state->num_lanes_inuse == 3) {
+ len = min(min(state->lens[0], state->lens[1]), state->lens[2]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sha256_mb_ce_x3(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane, len >> 4);
+
+ } else
+#elif SHA256_MB_CE_MAX_LANES == 2
+ if (state->num_lanes_inuse == 2) {
+ len = min(state->lens[0], state->lens[1]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sha256_mb_ce_x2(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane, len >> 4);
+
+ } else
+#endif
+ {
+ lanes = 0, len = 0;
+ for (i = 0; i < SHA256_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ if (lanes)
+ len = min(len, state->lens[i]);
+ else
+ len = state->lens[i];
+ lane_idx_array[lanes] = i;
+ lanes++;
+ }
+ }
+ if (lanes == 0)
+ return -1;
+ lane_idx = len & 0xf;
+ len = len & (~0xf);
+#if SHA256_MB_CE_MAX_LANES >=4
+ if (lanes == 4) {
+ sha256_mb_ce_x4(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane,
+ state->ldata[lane_idx_array[3]].job_in_lane, len >> 4);
+
+ } else
+#endif
+#if SHA256_MB_CE_MAX_LANES >=3
+ if (lanes == 3) {
+ sha256_mb_ce_x3(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane, len >> 4);
+ } else
+#endif
+#if SHA256_MB_CE_MAX_LANES >=2
+ if (lanes == 2) {
+ sha256_mb_ce_x2(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane, len >> 4);
+ } else
+#endif
+ {
+ sha256_mb_ce_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4);
+ }
+ }
+	//advance every unfinished lane by the amount just hashed (the minimum length);
+	//the lane that held that minimum is now finished and is the one returned
+ for (i = 0; i < SHA256_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+
+ return lane_idx;
+
+}
+
+static SHA256_JOB *sha256_mb_mgr_free_lane(SHA256_MB_JOB_MGR * state)
+{
+ int i;
+ SHA256_JOB *ret = NULL;
+
+ for (i = 0; i < SHA256_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sha256_mb_mgr_insert_job(SHA256_MB_JOB_MGR * state, SHA256_JOB * job)
+{
+ int lane_idx;
+ //add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+ //fatal error
+ assert(lane_idx < SHA256_MB_CE_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SHA256_JOB *sha256_mb_mgr_submit_ce(SHA256_MB_JOB_MGR * state, SHA256_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SHA256_JOB *ret;
+
+ //add job into lanes
+ sha256_mb_mgr_insert_job(state, job);
+
+ ret = sha256_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	//submit waits until every lane has data before starting to process
+ if (state->num_lanes_inuse < SHA256_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sha256_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sha256_mb_mgr_do_jobs(state);
+#endif
+
+ //~ i = lane_idx;
+ ret = sha256_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SHA256_JOB *sha256_mb_mgr_flush_ce(SHA256_MB_JOB_MGR * state)
+{
+ SHA256_JOB *ret;
+ ret = sha256_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sha256_mb_mgr_do_jobs(state);
+ return sha256_mb_mgr_free_lane(state);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S
new file mode 100644
index 000000000..ecc5fc5f5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_multibinary.S
@@ -0,0 +1,36 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include <aarch64_multibinary.h>
+
+
+mbin_interface sha256_ctx_mgr_submit
+mbin_interface sha256_ctx_mgr_init
+mbin_interface sha256_ctx_mgr_flush
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S
new file mode 100644
index 000000000..06d0ab5fa
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x1_ce.S
@@ -0,0 +1,238 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+macros for rounds 48-63
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req
+ ldr key_q , [tmp]
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ add tmp,tmp,16
+ add l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+ sha256h l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s
+
+.endm
+/**
+macros for rounds 0-47
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req
+ sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+ sha256_4_rounds_high \msg1,\tmp0,\tmp1
+ sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+.endm
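+/* Each round group consumes the w+key sum prepared in its tmp0 argument by the previous
+ * group and prepares the next group's sum in tmp1, which is why the calls below alternate
+ * the tmp0,tmp1 / tmp1,tmp0 argument order. */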
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,31
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_abcd,0
+ declare_var_vector_reg l0_efgh,1
+ declare_var_vector_reg l0_abcd_saved,5
+ declare_var_vector_reg l0_efgh_saved,6
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,2
+ declare_var_vector_reg l0_tmp1,3
+ declare_var_vector_reg l0_tmp2,4
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+
+
+
+/*
+ void sha256_mb_ce_x1(SHA256_JOB * l0_job, int len);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ len .req w1
+ l0_data .req x2
+ tmp .req x3
+ .global sha256_mb_ce_x1
+ .type sha256_mb_ce_x1, %function
+sha256_mb_ce_x1:
+ ldr l0_data, [l0_job]
+ ldr l0_abcd_q, [l0_job, 64]
+ ldr l0_efgh_q, [l0_job, 80]
+
+
+
+start_loop:
+ adr tmp, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ ldr key_q,[tmp]
+ add tmp,tmp,16
+ //adjust loop parameter
+ add l0_data,l0_data,64
+ sub len, len, #1
+ cmp len, 0
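+	// the flags set by this cmp survive the round body below (SIMD and non-flag-setting
+	// scalar instructions only) and are consumed by the bgt at the bottom of the loop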
+ //backup digest
+ mov l0_abcd_saved_v.16b,l0_abcd_v.16b
+ mov l0_efgh_saved_v.16b,l0_efgh_v.16b
+
+ rev32 l0_msg0_v.16b,l0_msg0_v.16b
+ rev32 l0_msg1_v.16b,l0_msg1_v.16b
+ add l0_tmp0_v.4s,l0_msg0_v.4s,key_v.4s
+ rev32 l0_msg2_v.16b,l0_msg2_v.16b
+ rev32 l0_msg3_v.16b,l0_msg3_v.16b
+
+
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 0-3 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 16-19 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 32-35 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+ sha256_4_rounds_high msg1,tmp0,tmp1 /* rounds 48-51 */
+ sha256_4_rounds_high msg2,tmp1,tmp0
+ sha256_4_rounds_high msg3,tmp0,tmp1
+
+ /* rounds 60-63 */
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ sha256h l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s
+
+
+
+ add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+ add l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+
+
+ bgt start_loop
+ str l0_abcd_q, [l0_job, 64]
+ str l0_efgh_q, [l0_job, 80]
+
+ ret
+
+ .size sha256_mb_ce_x1, .-sha256_mb_ce_x1
+ .section .rol0_data.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x428A2F98
+ .word 0x71374491
+ .word 0xB5C0FBCF
+ .word 0xE9B5DBA5
+ .word 0x3956C25B
+ .word 0x59F111F1
+ .word 0x923F82A4
+ .word 0xAB1C5ED5
+ .word 0xD807AA98
+ .word 0x12835B01
+ .word 0x243185BE
+ .word 0x550C7DC3
+ .word 0x72BE5D74
+ .word 0x80DEB1FE
+ .word 0x9BDC06A7
+ .word 0xC19BF174
+ .word 0xE49B69C1
+ .word 0xEFBE4786
+ .word 0x0FC19DC6
+ .word 0x240CA1CC
+ .word 0x2DE92C6F
+ .word 0x4A7484AA
+ .word 0x5CB0A9DC
+ .word 0x76F988DA
+ .word 0x983E5152
+ .word 0xA831C66D
+ .word 0xB00327C8
+ .word 0xBF597FC7
+ .word 0xC6E00BF3
+ .word 0xD5A79147
+ .word 0x06CA6351
+ .word 0x14292967
+ .word 0x27B70A85
+ .word 0x2E1B2138
+ .word 0x4D2C6DFC
+ .word 0x53380D13
+ .word 0x650A7354
+ .word 0x766A0ABB
+ .word 0x81C2C92E
+ .word 0x92722C85
+ .word 0xA2BFE8A1
+ .word 0xA81A664B
+ .word 0xC24B8B70
+ .word 0xC76C51A3
+ .word 0xD192E819
+ .word 0xD6990624
+ .word 0xF40E3585
+ .word 0x106AA070
+ .word 0x19A4C116
+ .word 0x1E376C08
+ .word 0x2748774C
+ .word 0x34B0BCB5
+ .word 0x391C0CB3
+ .word 0x4ED8AA4A
+ .word 0x5B9CCA4F
+ .word 0x682E6FF3
+ .word 0x748F82EE
+ .word 0x78A5636F
+ .word 0x84C87814
+ .word 0x8CC70208
+ .word 0x90BEFFFA
+ .word 0xA4506CEB
+ .word 0xBEF9A3F7
+ .word 0xC67178F2
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S
new file mode 100644
index 000000000..dadf44bb0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x2_ce.S
@@ -0,0 +1,289 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+macros for rounds 48-63
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req
+ ldr key_q , [tmp]
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ mov l1_tmp2_v.16b,l1_abcd_v.16b
+ add tmp,tmp,16
+ add l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+ add l1_\tmp1\()_v.4s,l1_\msg\()_v.4s,key_v.4s
+ sha256h l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s
+ sha256h l1_abcd_q,l1_efgh_q,l1_\tmp0\()_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s
+ sha256h2 l1_efgh_q,l1_tmp2_q,l1_\tmp0\()_v.4s
+
+.endm
+/**
+macros for rounds 0-47
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req
+ sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+ sha256su0 l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
+ sha256_4_rounds_high \msg1,\tmp0,\tmp1
+ sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+ sha256su1 l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,31
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_abcd,0
+ declare_var_vector_reg l0_efgh,1
+ declare_var_vector_reg l0_abcd_saved,2
+ declare_var_vector_reg l0_efgh_saved,3
+ declare_var_vector_reg l1_abcd,4
+ declare_var_vector_reg l1_efgh,5
+ declare_var_vector_reg l1_abcd_saved,6
+ declare_var_vector_reg l1_efgh_saved,7
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,8
+ declare_var_vector_reg l0_tmp1,9
+ declare_var_vector_reg l0_tmp2,10
+ declare_var_vector_reg l1_tmp0,11
+ declare_var_vector_reg l1_tmp1,12
+ declare_var_vector_reg l1_tmp2,13
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+ declare_var_vector_reg l1_msg0,20
+ declare_var_vector_reg l1_msg1,21
+ declare_var_vector_reg l1_msg2,22
+ declare_var_vector_reg l1_msg3,23
+
+
+
+/*
+ void sha256_mb_ce_x2(SHA256_JOB *, SHA256_JOB *, int);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ l1_job .req x1
+ len .req w2
+ l0_data .req x3
+ l1_data .req x4
+ tmp .req x5
+ .global sha256_mb_ce_x2
+ .type sha256_mb_ce_x2, %function
+sha256_mb_ce_x2:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+ ldr l0_data, [l0_job]
+ ldr l0_abcd_q, [l0_job, 64]
+ ldr l0_efgh_q, [l0_job, 80]
+ ldr l1_data, [l1_job]
+ ldr l1_abcd_q, [l1_job, 64]
+ ldr l1_efgh_q, [l1_job, 80]
+
+
+
+start_loop:
+
+ //load key addr
+ adr tmp, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ ld1 {l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+ ldr key_q,[tmp]
+ add tmp,tmp,16
+ //adjust loop parameter
+ add l0_data,l0_data,64
+ add l1_data,l1_data,64
+ sub len, len, #1
+ cmp len, 0
+ //backup digest
+ mov l0_abcd_saved_v.16b,l0_abcd_v.16b
+ mov l0_efgh_saved_v.16b,l0_efgh_v.16b
+ mov l1_abcd_saved_v.16b,l1_abcd_v.16b
+ mov l1_efgh_saved_v.16b,l1_efgh_v.16b
+
+ rev32 l0_msg0_v.16b,l0_msg0_v.16b
+ rev32 l0_msg1_v.16b,l0_msg1_v.16b
+ add l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s
+ rev32 l0_msg2_v.16b,l0_msg2_v.16b
+ rev32 l0_msg3_v.16b,l0_msg3_v.16b
+
+ rev32 l1_msg0_v.16b,l1_msg0_v.16b
+ rev32 l1_msg1_v.16b,l1_msg1_v.16b
+ add l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
+ rev32 l1_msg2_v.16b,l1_msg2_v.16b
+ rev32 l1_msg3_v.16b,l1_msg3_v.16b
+
+
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 0-3 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 16-19 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 32-35 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+ sha256_4_rounds_high msg1,tmp0,tmp1 /* rounds 48-51 */
+ sha256_4_rounds_high msg2,tmp1,tmp0
+ sha256_4_rounds_high msg3,tmp0,tmp1
+
+ /* rounds 60-63 */
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ sha256h l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s
+
+ mov l1_tmp2_v.16b,l1_abcd_v.16b
+ sha256h l1_abcd_q,l1_efgh_q,l1_tmp1_v.4s
+ sha256h2 l1_efgh_q,l1_tmp2_q,l1_tmp1_v.4s
+
+
+
+ add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+ add l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+ add l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
+ add l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
+
+
+ bgt start_loop
+ str l0_abcd_q, [l0_job, 64]
+ str l0_efgh_q, [l0_job, 80]
+ str l1_abcd_q, [l1_job, 64]
+ str l1_efgh_q, [l1_job, 80]
+
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+ ret
+
+ .size sha256_mb_ce_x2, .-sha256_mb_ce_x2
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x428A2F98
+ .word 0x71374491
+ .word 0xB5C0FBCF
+ .word 0xE9B5DBA5
+ .word 0x3956C25B
+ .word 0x59F111F1
+ .word 0x923F82A4
+ .word 0xAB1C5ED5
+ .word 0xD807AA98
+ .word 0x12835B01
+ .word 0x243185BE
+ .word 0x550C7DC3
+ .word 0x72BE5D74
+ .word 0x80DEB1FE
+ .word 0x9BDC06A7
+ .word 0xC19BF174
+ .word 0xE49B69C1
+ .word 0xEFBE4786
+ .word 0x0FC19DC6
+ .word 0x240CA1CC
+ .word 0x2DE92C6F
+ .word 0x4A7484AA
+ .word 0x5CB0A9DC
+ .word 0x76F988DA
+ .word 0x983E5152
+ .word 0xA831C66D
+ .word 0xB00327C8
+ .word 0xBF597FC7
+ .word 0xC6E00BF3
+ .word 0xD5A79147
+ .word 0x06CA6351
+ .word 0x14292967
+ .word 0x27B70A85
+ .word 0x2E1B2138
+ .word 0x4D2C6DFC
+ .word 0x53380D13
+ .word 0x650A7354
+ .word 0x766A0ABB
+ .word 0x81C2C92E
+ .word 0x92722C85
+ .word 0xA2BFE8A1
+ .word 0xA81A664B
+ .word 0xC24B8B70
+ .word 0xC76C51A3
+ .word 0xD192E819
+ .word 0xD6990624
+ .word 0xF40E3585
+ .word 0x106AA070
+ .word 0x19A4C116
+ .word 0x1E376C08
+ .word 0x2748774C
+ .word 0x34B0BCB5
+ .word 0x391C0CB3
+ .word 0x4ED8AA4A
+ .word 0x5B9CCA4F
+ .word 0x682E6FF3
+ .word 0x748F82EE
+ .word 0x78A5636F
+ .word 0x84C87814
+ .word 0x8CC70208
+ .word 0x90BEFFFA
+ .word 0xA4506CEB
+ .word 0xBEF9A3F7
+ .word 0xC67178F2
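
For context on how a caller is expected to drive this kernel: sha256_mb_ce_x2
advances two jobs in lockstep, hashing len 64-byte blocks per lane, and it does
no padding or finalization of its own. The C sketch below is illustrative only;
the sha256_job_sketch struct is reconstructed from the offsets the assembly
reads (data pointer at offset 0, digest words at offsets 64 and 80) and is not
the real SHA256_JOB from sha256_mb.h.

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical layout matching the offsets used above: [job] = data
     * pointer, [job + 64] = abcd, [job + 80] = efgh. */
    struct sha256_job_sketch {
            const uint8_t *buffer;     /* offset 0: message blocks */
            uint8_t        pad[56];    /* filler so the digest lands at 64 */
            uint32_t       digest[8];  /* offset 64: working state H0..H7 */
    };

    /* The assembly routine above: hashes 'len' full 64-byte blocks per job. */
    extern void sha256_mb_ce_x2(struct sha256_job_sketch *job0,
                                struct sha256_job_sketch *job1, int len);

    static void hash_two_buffers(const uint8_t *m0, const uint8_t *m1,
                                 int nblocks, uint32_t d0[8], uint32_t d1[8])
    {
            /* SHA-256 initial state (FIPS 180-4), kept as native 32-bit words;
             * the kernel byte-swaps the message, not the state. */
            static const uint32_t H0[8] = {
                    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
                    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
            };
            struct sha256_job_sketch j0 = { .buffer = m0 };
            struct sha256_job_sketch j1 = { .buffer = m1 };

            memcpy(j0.digest, H0, sizeof(H0));
            memcpy(j1.digest, H0, sizeof(H0));
            sha256_mb_ce_x2(&j0, &j1, nblocks); /* both lanes advance together */
            memcpy(d0, j0.digest, sizeof(H0));  /* padding is the caller's job */
            memcpy(d1, j1.digest, sizeof(H0));
    }
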
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S
new file mode 100644
index 000000000..6ed1591ba
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x3_ce.S
@@ -0,0 +1,342 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+macro for rounds 48-63
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req,tmp1:req
+ ldr key_q , [tmp]
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ mov l1_tmp2_v.16b,l1_abcd_v.16b
+ mov l2_tmp2_v.16b,l2_abcd_v.16b
+ add tmp,tmp,16
+ add l0_\tmp1\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+ add l1_\tmp1\()_v.4s,l1_\msg\()_v.4s,key_v.4s
+ add l2_\tmp1\()_v.4s,l2_\msg\()_v.4s,key_v.4s
+ sha256h l0_abcd_q,l0_efgh_q,l0_\tmp0\()_v.4s
+ sha256h l1_abcd_q,l1_efgh_q,l1_\tmp0\()_v.4s
+ sha256h l2_abcd_q,l2_efgh_q,l2_\tmp0\()_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_\tmp0\()_v.4s
+ sha256h2 l1_efgh_q,l1_tmp2_q,l1_\tmp0\()_v.4s
+ sha256h2 l2_efgh_q,l2_tmp2_q,l2_\tmp0\()_v.4s
+
+.endm
+/**
+macro for rounds 0-47
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req,tmp1:req
+ sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+ sha256su0 l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
+ sha256su0 l2_\msg0\()_v.4s,l2_\msg1\()_v.4s
+ sha256_4_rounds_high \msg1,\tmp0,\tmp1
+ sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+ sha256su1 l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+ sha256su1 l2_\msg0\()_v.4s,l2_\msg2\()_v.4s,l2_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,31
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_abcd,0
+ declare_var_vector_reg l0_efgh,1
+ declare_var_vector_reg l1_abcd,2
+ declare_var_vector_reg l1_efgh,3
+ declare_var_vector_reg l2_abcd,4
+ declare_var_vector_reg l2_efgh,5
+ declare_var_vector_reg l1_abcd_saved,16
+ declare_var_vector_reg l1_efgh_saved,17
+ declare_var_vector_reg l0_abcd_saved,20
+ declare_var_vector_reg l0_efgh_saved,21
+ declare_var_vector_reg l2_abcd_saved,24
+ declare_var_vector_reg l2_efgh_saved,25
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,6
+ declare_var_vector_reg l0_tmp1,7
+ declare_var_vector_reg l0_tmp2,8
+ declare_var_vector_reg l1_tmp0,9
+ declare_var_vector_reg l1_tmp1,10
+ declare_var_vector_reg l1_tmp2,11
+ declare_var_vector_reg l2_tmp0,12
+ declare_var_vector_reg l2_tmp1,13
+ declare_var_vector_reg l2_tmp2,14
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+ declare_var_vector_reg l1_msg0,20
+ declare_var_vector_reg l1_msg1,21
+ declare_var_vector_reg l1_msg2,22
+ declare_var_vector_reg l1_msg3,23
+ declare_var_vector_reg l2_msg0,24
+ declare_var_vector_reg l2_msg1,25
+ declare_var_vector_reg l2_msg2,26
+ declare_var_vector_reg l2_msg3,27
+
+
+
+/*
+ void sha256_mb_ce_x3(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ l1_job .req x1
+ l2_job .req x2
+ len .req w3
+ l0_data .req x4
+ l1_data .req x5
+ l2_data .req x6
+ tmp .req x7
+ .global sha256_mb_ce_x3
+ .type sha256_mb_ce_x3, %function
+sha256_mb_ce_x3:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+ ldr l0_data, [l0_job]
+ ldr l0_abcd_q, [l0_job, 64]
+ ldr l0_efgh_q, [l0_job, 80]
+ ldr l1_data, [l1_job]
+ ldr l1_abcd_q, [l1_job, 64]
+ ldr l1_efgh_q, [l1_job, 80]
+ ldr l2_data, [l2_job]
+ ldr l2_abcd_q, [l2_job, 64]
+ ldr l2_efgh_q, [l2_job, 80]
+
+
+
+start_loop:
+
+ //load key addr
+ adr tmp, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ ld1 {l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+ ld1 {l2_msg0_v.4s-l2_msg3_v.4s},[l2_data]
+ ldr key_q,[tmp]
+ add tmp,tmp,16
+ //adjust loop parameter
+ add l0_data,l0_data,64
+ add l1_data,l1_data,64
+ add l2_data,l2_data,64
+ sub len, len, #1
+ cmp len, 0
+/*
+ //backup digest
+ mov l0_abcd_saved_v.16b,l0_abcd_v.16b
+ mov l0_efgh_saved_v.16b,l0_efgh_v.16b
+ mov l1_abcd_saved_v.16b,l1_abcd_v.16b
+ mov l1_efgh_saved_v.16b,l1_efgh_v.16b
+ mov l2_abcd_saved_v.16b,l2_abcd_v.16b
+ mov l2_efgh_saved_v.16b,l2_efgh_v.16b
+*/
+
+ rev32 l0_msg0_v.16b,l0_msg0_v.16b
+ rev32 l0_msg1_v.16b,l0_msg1_v.16b
+ add l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s
+ rev32 l0_msg2_v.16b,l0_msg2_v.16b
+ rev32 l0_msg3_v.16b,l0_msg3_v.16b
+
+ rev32 l1_msg0_v.16b,l1_msg0_v.16b
+ rev32 l1_msg1_v.16b,l1_msg1_v.16b
+ add l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
+ rev32 l1_msg2_v.16b,l1_msg2_v.16b
+ rev32 l1_msg3_v.16b,l1_msg3_v.16b
+
+ rev32 l2_msg0_v.16b,l2_msg0_v.16b
+ rev32 l2_msg1_v.16b,l2_msg1_v.16b
+ add l2_tmp0_v.4s, l2_msg0_v.4s,key_v.4s
+ rev32 l2_msg2_v.16b,l2_msg2_v.16b
+ rev32 l2_msg3_v.16b,l2_msg3_v.16b
+
+
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 0-3 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 16-19 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0,tmp1 /* rounds 32-35 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp1,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0,tmp1
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp1,tmp0
+
+
+
+ sha256_4_rounds_high msg1,tmp0,tmp1 /* rounds 48-51 */
+
+ /* msg0/msg1 are no longer needed, so their registers hold the saved digest */
+ ldr l0_abcd_saved_q, [l0_job, 64]
+ ldr l1_abcd_saved_q, [l1_job, 64]
+ ldr l2_abcd_saved_q, [l2_job, 64]
+ ldr l0_efgh_saved_q, [l0_job, 80]
+ ldr l1_efgh_saved_q, [l1_job, 80]
+ ldr l2_efgh_saved_q, [l2_job, 80]
+
+ sha256_4_rounds_high msg2,tmp1,tmp0
+ sha256_4_rounds_high msg3,tmp0,tmp1
+
+ /* rounds 60-63 */
+ mov l0_tmp2_v.16b,l0_abcd_v.16b
+ sha256h l0_abcd_q,l0_efgh_q,l0_tmp1_v.4s
+ sha256h2 l0_efgh_q,l0_tmp2_q,l0_tmp1_v.4s
+
+ mov l1_tmp2_v.16b,l1_abcd_v.16b
+ sha256h l1_abcd_q,l1_efgh_q,l1_tmp1_v.4s
+ sha256h2 l1_efgh_q,l1_tmp2_q,l1_tmp1_v.4s
+
+ mov l2_tmp2_v.16b,l2_abcd_v.16b
+ sha256h l2_abcd_q,l2_efgh_q,l2_tmp1_v.4s
+ sha256h2 l2_efgh_q,l2_tmp2_q,l2_tmp1_v.4s
+
+ /* combine state */
+ add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+ add l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+ add l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
+ add l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
+ add l2_abcd_v.4s,l2_abcd_v.4s,l2_abcd_saved_v.4s
+ add l2_efgh_v.4s,l2_efgh_v.4s,l2_efgh_saved_v.4s
+
+ str l0_abcd_q, [l0_job, 64]
+ str l0_efgh_q, [l0_job, 80]
+ str l1_abcd_q, [l1_job, 64]
+ str l1_efgh_q, [l1_job, 80]
+ str l2_abcd_q, [l2_job, 64]
+ str l2_efgh_q, [l2_job, 80]
+
+ bgt start_loop
+
+
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+ ret
+
+ .size sha256_mb_ce_x3, .-sha256_mb_ce_x3
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x428A2F98
+ .word 0x71374491
+ .word 0xB5C0FBCF
+ .word 0xE9B5DBA5
+ .word 0x3956C25B
+ .word 0x59F111F1
+ .word 0x923F82A4
+ .word 0xAB1C5ED5
+ .word 0xD807AA98
+ .word 0x12835B01
+ .word 0x243185BE
+ .word 0x550C7DC3
+ .word 0x72BE5D74
+ .word 0x80DEB1FE
+ .word 0x9BDC06A7
+ .word 0xC19BF174
+ .word 0xE49B69C1
+ .word 0xEFBE4786
+ .word 0x0FC19DC6
+ .word 0x240CA1CC
+ .word 0x2DE92C6F
+ .word 0x4A7484AA
+ .word 0x5CB0A9DC
+ .word 0x76F988DA
+ .word 0x983E5152
+ .word 0xA831C66D
+ .word 0xB00327C8
+ .word 0xBF597FC7
+ .word 0xC6E00BF3
+ .word 0xD5A79147
+ .word 0x06CA6351
+ .word 0x14292967
+ .word 0x27B70A85
+ .word 0x2E1B2138
+ .word 0x4D2C6DFC
+ .word 0x53380D13
+ .word 0x650A7354
+ .word 0x766A0ABB
+ .word 0x81C2C92E
+ .word 0x92722C85
+ .word 0xA2BFE8A1
+ .word 0xA81A664B
+ .word 0xC24B8B70
+ .word 0xC76C51A3
+ .word 0xD192E819
+ .word 0xD6990624
+ .word 0xF40E3585
+ .word 0x106AA070
+ .word 0x19A4C116
+ .word 0x1E376C08
+ .word 0x2748774C
+ .word 0x34B0BCB5
+ .word 0x391C0CB3
+ .word 0x4ED8AA4A
+ .word 0x5B9CCA4F
+ .word 0x682E6FF3
+ .word 0x748F82EE
+ .word 0x78A5636F
+ .word 0x84C87814
+ .word 0x8CC70208
+ .word 0x90BEFFFA
+ .word 0xA4506CEB
+ .word 0xBEF9A3F7
+ .word 0xC67178F2
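
The KEY table carried at the end of each of these kernels is the standard
SHA-256 round-constant schedule: entry t is the first 32 bits of the fractional
part of the cube root of the t-th prime. A small standalone C check (not part
of the patch) that reproduces the first row of the table:

    #include <inttypes.h>
    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Recompute K[t] = floor(frac(cbrt(prime_t)) * 2^32) for the first eight
     * primes and compare against the start of the KEY table above.
     * Build with: cc -O2 k_check.c -lm */
    int main(void)
    {
            const unsigned primes[8] = { 2, 3, 5, 7, 11, 13, 17, 19 };
            const uint32_t expect[8] = {
                    0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
                    0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5
            };

            for (unsigned t = 0; t < 8; t++) {
                    double frac = cbrt((double)primes[t]);
                    frac -= floor(frac);
                    uint32_t k = (uint32_t)(frac * 4294967296.0); /* 2^32 */
                    printf("K[%u] = 0x%08" PRIX32 " %s\n", t, k,
                           k == expect[t] ? "(matches)" : "(mismatch)");
            }
            return 0;
    }
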
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S
new file mode 100644
index 000000000..b1686ada1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/aarch64/sha256_mb_x4_ce.S
@@ -0,0 +1,380 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8-a+crypto
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+macro for rounds 48-63
+tmp0 : in
+tmp1 : out
+*/
+.macro sha256_4_rounds_high msg:req,tmp0:req
+ ldr key_q , [tmp]
+ mov tmp0_v.16b,l0_\tmp0\()_v.16b
+ mov tmp1_v.16b,l1_\tmp0\()_v.16b
+ add l0_\tmp0\()_v.4s,l0_\msg\()_v.4s,key_v.4s
+ add l1_\tmp0\()_v.4s,l1_\msg\()_v.4s,key_v.4s
+ mov tmp2_v.16b,l0_abcd_v.16b
+ mov tmp3_v.16b,l1_abcd_v.16b
+ sha256h l0_abcd_q,l0_efgh_q,tmp0_v.4s
+ sha256h l1_abcd_q,l1_efgh_q,tmp1_v.4s
+ sha256h2 l0_efgh_q,tmp2_q,tmp0_v.4s
+ sha256h2 l1_efgh_q,tmp3_q,tmp1_v.4s
+
+ ldr key_q , [tmp]
+ mov tmp0_v.16b,l2_\tmp0\()_v.16b
+ mov tmp1_v.16b,l3_\tmp0\()_v.16b
+ add tmp,tmp,16
+ add l2_\tmp0\()_v.4s,l2_\msg\()_v.4s,key_v.4s
+ add l3_\tmp0\()_v.4s,l3_\msg\()_v.4s,key_v.4s
+ mov tmp2_v.16b,l2_abcd_v.16b
+ mov tmp3_v.16b,l3_abcd_v.16b
+ sha256h l2_abcd_q,l2_efgh_q,tmp0_v.4s
+ sha256h l3_abcd_q,l3_efgh_q,tmp1_v.4s
+ sha256h2 l2_efgh_q,tmp2_q,tmp0_v.4s
+ sha256h2 l3_efgh_q,tmp3_q,tmp1_v.4s
+
+
+.endm
+/**
+macro for rounds 0-47
+*/
+.macro sha256_4_rounds_low msg0:req,msg1:req,msg2:req,msg3:req,tmp0:req
+ sha256su0 l0_\msg0\()_v.4s,l0_\msg1\()_v.4s
+ sha256su0 l1_\msg0\()_v.4s,l1_\msg1\()_v.4s
+ sha256su0 l2_\msg0\()_v.4s,l2_\msg1\()_v.4s
+ sha256su0 l3_\msg0\()_v.4s,l3_\msg1\()_v.4s
+ sha256_4_rounds_high \msg1,\tmp0
+ sha256su1 l0_\msg0\()_v.4s,l0_\msg2\()_v.4s,l0_\msg3\()_v.4s
+ sha256su1 l1_\msg0\()_v.4s,l1_\msg2\()_v.4s,l1_\msg3\()_v.4s
+ sha256su1 l2_\msg0\()_v.4s,l2_\msg2\()_v.4s,l2_\msg3\()_v.4s
+ sha256su1 l3_\msg0\()_v.4s,l3_\msg2\()_v.4s,l3_\msg3\()_v.4s
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,15
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_abcd,0
+ declare_var_vector_reg l0_efgh,1
+ declare_var_vector_reg l1_abcd,2
+ declare_var_vector_reg l1_efgh,3
+ declare_var_vector_reg l2_abcd,4
+ declare_var_vector_reg l2_efgh,5
+ declare_var_vector_reg l3_abcd,6
+ declare_var_vector_reg l3_efgh,7
+ declare_var_vector_reg l1_abcd_saved,16
+ declare_var_vector_reg l1_efgh_saved,17
+ declare_var_vector_reg l0_abcd_saved,20
+ declare_var_vector_reg l0_efgh_saved,21
+ declare_var_vector_reg l2_abcd_saved,24
+ declare_var_vector_reg l2_efgh_saved,25
+ declare_var_vector_reg l3_abcd_saved,28
+ declare_var_vector_reg l3_efgh_saved,29
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,8
+ declare_var_vector_reg l1_tmp0,9
+ declare_var_vector_reg l2_tmp0,10
+ declare_var_vector_reg l3_tmp0,11
+
+ declare_var_vector_reg tmp0,12
+ declare_var_vector_reg tmp1,13
+ declare_var_vector_reg tmp2,14
+ declare_var_vector_reg tmp3,15
+
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+ declare_var_vector_reg l1_msg0,20
+ declare_var_vector_reg l1_msg1,21
+ declare_var_vector_reg l1_msg2,22
+ declare_var_vector_reg l1_msg3,23
+ declare_var_vector_reg l2_msg0,24
+ declare_var_vector_reg l2_msg1,25
+ declare_var_vector_reg l2_msg2,26
+ declare_var_vector_reg l2_msg3,27
+ declare_var_vector_reg l3_msg0,28
+ declare_var_vector_reg l3_msg1,29
+ declare_var_vector_reg l3_msg2,30
+ declare_var_vector_reg l3_msg3,31
+
+
+
+/*
+ void sha256_mb_ce_x4(SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, SHA256_JOB *, int);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ l1_job .req x1
+ l2_job .req x2
+ l3_job .req x3
+ len .req w4
+ l0_data .req x5
+ l1_data .req x6
+ l2_data .req x7
+ l3_data .req x8
+ tmp .req x9
+ .global sha256_mb_ce_x4
+ .type sha256_mb_ce_x4, %function
+sha256_mb_ce_x4:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+ ldr l0_data, [l0_job]
+ ldr l0_abcd_q, [l0_job, 64]
+ ldr l0_efgh_q, [l0_job, 80]
+ ldr l1_data, [l1_job]
+ ldr l1_abcd_q, [l1_job, 64]
+ ldr l1_efgh_q, [l1_job, 80]
+ ldr l2_data, [l2_job]
+ ldr l2_abcd_q, [l2_job, 64]
+ ldr l2_efgh_q, [l2_job, 80]
+ ldr l3_data, [l3_job]
+ ldr l3_abcd_q, [l3_job, 64]
+ ldr l3_efgh_q, [l3_job, 80]
+
+
+
+start_loop:
+
+ //load key addr
+ adr tmp, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ ld1 {l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+ ld1 {l2_msg0_v.4s-l2_msg3_v.4s},[l2_data]
+ ld1 {l3_msg0_v.4s-l3_msg3_v.4s},[l3_data]
+ ldr key_q,[tmp]
+ add tmp,tmp,16
+ //adjust loop parameter
+ add l0_data,l0_data,64
+ add l1_data,l1_data,64
+ add l2_data,l2_data,64
+ add l3_data,l3_data,64
+ sub len, len, #1
+ cmp len, 0
+
+
+ rev32 l0_msg0_v.16b,l0_msg0_v.16b
+ rev32 l0_msg1_v.16b,l0_msg1_v.16b
+ add l0_tmp0_v.4s, l0_msg0_v.4s,key_v.4s
+ rev32 l0_msg2_v.16b,l0_msg2_v.16b
+ rev32 l0_msg3_v.16b,l0_msg3_v.16b
+
+ rev32 l1_msg0_v.16b,l1_msg0_v.16b
+ rev32 l1_msg1_v.16b,l1_msg1_v.16b
+ add l1_tmp0_v.4s, l1_msg0_v.4s,key_v.4s
+ rev32 l1_msg2_v.16b,l1_msg2_v.16b
+ rev32 l1_msg3_v.16b,l1_msg3_v.16b
+
+ rev32 l2_msg0_v.16b,l2_msg0_v.16b
+ rev32 l2_msg1_v.16b,l2_msg1_v.16b
+ add l2_tmp0_v.4s, l2_msg0_v.4s,key_v.4s
+ rev32 l2_msg2_v.16b,l2_msg2_v.16b
+ rev32 l2_msg3_v.16b,l2_msg3_v.16b
+
+ rev32 l3_msg0_v.16b,l3_msg0_v.16b
+ rev32 l3_msg1_v.16b,l3_msg1_v.16b
+ add l3_tmp0_v.4s, l3_msg0_v.4s,key_v.4s
+ rev32 l3_msg2_v.16b,l3_msg2_v.16b
+ rev32 l3_msg3_v.16b,l3_msg3_v.16b
+
+
+
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0 /* rounds 0-3 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp0
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0 /* rounds 16-19 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp0
+ sha256_4_rounds_low msg0,msg1,msg2,msg3,tmp0 /* rounds 32-35 */
+ sha256_4_rounds_low msg1,msg2,msg3,msg0,tmp0
+ sha256_4_rounds_low msg2,msg3,msg0,msg1,tmp0
+ sha256_4_rounds_low msg3,msg0,msg1,msg2,tmp0
+
+
+
+ sha256_4_rounds_high msg1,tmp0 /* rounds 48-51 */
+
+ /* msg0/msg1 are no longer needed, so their registers hold the saved digest */
+ ldr l0_abcd_saved_q, [l0_job, 64]
+ ldr l1_abcd_saved_q, [l1_job, 64]
+ ldr l2_abcd_saved_q, [l2_job, 64]
+ ldr l3_abcd_saved_q, [l3_job, 64]
+ ldr l0_efgh_saved_q, [l0_job, 80]
+ ldr l1_efgh_saved_q, [l1_job, 80]
+ ldr l2_efgh_saved_q, [l2_job, 80]
+ ldr l3_efgh_saved_q, [l3_job, 80]
+
+ sha256_4_rounds_high msg2,tmp0
+ sha256_4_rounds_high msg3,tmp0
+
+ /* rounds 60-63 */
+ mov tmp2_v.16b,l0_abcd_v.16b
+ sha256h l0_abcd_q,l0_efgh_q,l0_tmp0_v.4s
+ sha256h2 l0_efgh_q,tmp2_q,l0_tmp0_v.4s
+
+ mov tmp2_v.16b,l1_abcd_v.16b
+ sha256h l1_abcd_q,l1_efgh_q,l1_tmp0_v.4s
+ sha256h2 l1_efgh_q,tmp2_q,l1_tmp0_v.4s
+
+ mov tmp2_v.16b,l2_abcd_v.16b
+ sha256h l2_abcd_q,l2_efgh_q,l2_tmp0_v.4s
+ sha256h2 l2_efgh_q,tmp2_q,l2_tmp0_v.4s
+
+ mov tmp2_v.16b,l3_abcd_v.16b
+ sha256h l3_abcd_q,l3_efgh_q,l3_tmp0_v.4s
+ sha256h2 l3_efgh_q,tmp2_q,l3_tmp0_v.4s
+
+ /* combine state */
+ add l0_abcd_v.4s,l0_abcd_v.4s,l0_abcd_saved_v.4s
+ add l0_efgh_v.4s,l0_efgh_v.4s,l0_efgh_saved_v.4s
+ add l1_abcd_v.4s,l1_abcd_v.4s,l1_abcd_saved_v.4s
+ add l1_efgh_v.4s,l1_efgh_v.4s,l1_efgh_saved_v.4s
+ add l2_abcd_v.4s,l2_abcd_v.4s,l2_abcd_saved_v.4s
+ add l2_efgh_v.4s,l2_efgh_v.4s,l2_efgh_saved_v.4s
+ add l3_abcd_v.4s,l3_abcd_v.4s,l3_abcd_saved_v.4s
+ add l3_efgh_v.4s,l3_efgh_v.4s,l3_efgh_saved_v.4s
+
+ str l0_abcd_q, [l0_job, 64]
+ str l0_efgh_q, [l0_job, 80]
+ str l1_abcd_q, [l1_job, 64]
+ str l1_efgh_q, [l1_job, 80]
+ str l2_abcd_q, [l2_job, 64]
+ str l2_efgh_q, [l2_job, 80]
+ str l3_abcd_q, [l3_job, 64]
+ str l3_efgh_q, [l3_job, 80]
+
+ bgt start_loop
+
+
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+ ret
+
+ .size sha256_mb_ce_x4, .-sha256_mb_ce_x4
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .word 0x428A2F98
+ .word 0x71374491
+ .word 0xB5C0FBCF
+ .word 0xE9B5DBA5
+ .word 0x3956C25B
+ .word 0x59F111F1
+ .word 0x923F82A4
+ .word 0xAB1C5ED5
+ .word 0xD807AA98
+ .word 0x12835B01
+ .word 0x243185BE
+ .word 0x550C7DC3
+ .word 0x72BE5D74
+ .word 0x80DEB1FE
+ .word 0x9BDC06A7
+ .word 0xC19BF174
+ .word 0xE49B69C1
+ .word 0xEFBE4786
+ .word 0x0FC19DC6
+ .word 0x240CA1CC
+ .word 0x2DE92C6F
+ .word 0x4A7484AA
+ .word 0x5CB0A9DC
+ .word 0x76F988DA
+ .word 0x983E5152
+ .word 0xA831C66D
+ .word 0xB00327C8
+ .word 0xBF597FC7
+ .word 0xC6E00BF3
+ .word 0xD5A79147
+ .word 0x06CA6351
+ .word 0x14292967
+ .word 0x27B70A85
+ .word 0x2E1B2138
+ .word 0x4D2C6DFC
+ .word 0x53380D13
+ .word 0x650A7354
+ .word 0x766A0ABB
+ .word 0x81C2C92E
+ .word 0x92722C85
+ .word 0xA2BFE8A1
+ .word 0xA81A664B
+ .word 0xC24B8B70
+ .word 0xC76C51A3
+ .word 0xD192E819
+ .word 0xD6990624
+ .word 0xF40E3585
+ .word 0x106AA070
+ .word 0x19A4C116
+ .word 0x1E376C08
+ .word 0x2748774C
+ .word 0x34B0BCB5
+ .word 0x391C0CB3
+ .word 0x4ED8AA4A
+ .word 0x5B9CCA4F
+ .word 0x682E6FF3
+ .word 0x748F82EE
+ .word 0x78A5636F
+ .word 0x84C87814
+ .word 0x8CC70208
+ .word 0x90BEFFFA
+ .word 0xA4506CEB
+ .word 0xBEF9A3F7
+ .word 0xC67178F2
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c
new file mode 100644
index 000000000..12441a8e3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c
@@ -0,0 +1,268 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx_slver_02020154;
+struct slver sha256_ctx_mgr_init_avx_slver = { 0x0154, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_submit_avx_slver_02020155;
+struct slver sha256_ctx_mgr_submit_avx_slver = { 0x0155, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_flush_avx_slver_02020156;
+struct slver sha256_ctx_mgr_flush_avx_slver = { 0x0156, 0x02, 0x02 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
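
Putting the pieces of sha256_ctx_avx.c together, a caller submits contexts to
the manager and then flushes until nothing is left in flight. The sketch below
is a rough usage example, not the official API contract: the 64-byte aligned
allocation, the zero-initialization standing in for the library's context init
helper, and the single-buffer HASH_ENTIRE flow are assumptions.

    #include <inttypes.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "sha256_mb.h"

    int main(void)
    {
            SHA256_HASH_CTX_MGR *mgr = NULL;
            SHA256_HASH_CTX ctx;
            const char msg[] = "abc";

            /* The manager holds SIMD lane state; give it generous alignment. */
            if (posix_memalign((void **)&mgr, 64, sizeof(*mgr)))
                    return 1;
            sha256_ctx_mgr_init_avx(mgr);

            memset(&ctx, 0, sizeof(ctx)); /* stands in for the ctx init helper */

            /* Submit a complete buffer in one call (FIRST and LAST together). */
            sha256_ctx_mgr_submit_avx(mgr, &ctx, msg, sizeof(msg) - 1, HASH_ENTIRE);

            /* Drain every lane; flush returns NULL once no jobs remain. */
            while (sha256_ctx_mgr_flush_avx(mgr) != NULL)
                    ;

            for (int i = 0; i < SHA256_DIGEST_NWORDS; i++)
                    printf("%08" PRIx32, ctx.job.result_digest[i]);
            putchar('\n');

            free(mgr);
            return 0;
    }
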
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c
new file mode 100644
index 000000000..9c045659e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c
@@ -0,0 +1,268 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx2(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx2(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx2(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx2_slver_04020157;
+struct slver sha256_ctx_mgr_init_avx2_slver = { 0x0157, 0x02, 0x04 };
+
+struct slver sha256_ctx_mgr_submit_avx2_slver_04020158;
+struct slver sha256_ctx_mgr_submit_avx2_slver = { 0x0158, 0x02, 0x04 };
+
+struct slver sha256_ctx_mgr_flush_avx2_slver_04020159;
+struct slver sha256_ctx_mgr_flush_avx2_slver = { 0x0159, 0x02, 0x04 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
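
hash_pad() above packs the whole padding-layout decision into one line of index
arithmetic. The standalone check below mirrors only the block count it returns,
assuming SHA256_BLOCK_SIZE is 64 and an 8-byte SHA256_PADLENGTHFIELD_SIZE (the
#if branch above also allows a 16-byte field); it is a sanity sketch, not
library code.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BLOCK    64u
    #define LENFIELD 8u   /* assumed SHA256_PADLENGTHFIELD_SIZE */

    /* Mirror of the index arithmetic in hash_pad(): step past the leftover
     * bytes, the 0x80 marker, the zero padding and the length field, landing
     * at the end of the first or second extra block. */
    static uint32_t extra_blocks(uint64_t total_len)
    {
            uint32_t i = (uint32_t)(total_len & (BLOCK - 1));

            i += ((BLOCK - 1) & (0 - (total_len + LENFIELD + 1))) + 1 + LENFIELD;
            return i >> 6; /* SHA256_LOG2_BLOCK_SIZE */
    }

    int main(void)
    {
            /* 36 leftover bytes + 0x80 + length fit in one extra block ... */
            assert(extra_blocks(100) == 1);
            /* ... but 56 leftover bytes spill into a second one. */
            assert(extra_blocks(120) == 2);
            /* An exact multiple of 64 still needs one padding-only block. */
            assert(extra_blocks(128) == 1);
            printf("hash_pad block counts check out\n");
            return 0;
    }
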
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c
new file mode 100644
index 000000000..a1f068987
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c
@@ -0,0 +1,273 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx512(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx512(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx512(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx512_slver_0600015a;
+struct slver sha256_ctx_mgr_init_avx512_slver = { 0x015a, 0x00, 0x06 };
+
+struct slver sha256_ctx_mgr_submit_avx512_slver_0600015b;
+struct slver sha256_ctx_mgr_submit_avx512_slver = { 0x015b, 0x00, 0x06 };
+
+struct slver sha256_ctx_mgr_flush_avx512_slver_0600015c;
+struct slver sha256_ctx_mgr_flush_avx512_slver = { 0x015c, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c
new file mode 100644
index 000000000..763057f12
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512_ni.c
@@ -0,0 +1,283 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+/**
+ * sha256_ctx_avx512_ni related functions aim to utilize Cannon Lake.
+ * Since SHANI is still slower than multibuffer for full lanes,
+ * sha256_ctx_mgr_init_avx512_ni and sha256_ctx_mgr_submit_avx512_ni are
+ * similar to their avx512 versions.
+ * sha256_ctx_mgr_flush_avx512_ni is different. It calls
+ * sha256_mb_mgr_flush_avx512_ni, which uses SHANI when fewer lanes than a
+ * threshold are in use.
+ *
+ */
+#if defined(HAVE_AS_KNOWS_AVX512) && defined(HAVE_AS_KNOWS_SHANI)
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx512_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx512_ni(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx, const void *buffer,
+ uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx512_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx512_ni(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
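+
+/*
+ * Worked example for hash_pad() above, assuming SHA256_PADLENGTHFIELD_SIZE
+ * is 8 (the usual 64-bit SHA-256 length field) and 64-byte blocks:
+ *   total_len = 100: i = 100 & 63 = 36, the 0x80 byte lands at offset 36,
+ *     then i += (63 & (0 - 109)) + 9 = 19 + 9, so i = 64 and one extra
+ *     block is hashed (36 + 1 + 8 fits within a single block).
+ *   total_len = 120: i = 56, then i += (63 & (0 - 129)) + 9 = 63 + 9, so
+ *     i = 128 and two extra blocks are hashed (56 + 1 + 8 > 64 spills
+ *     into a second block).
+ */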
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx512_ni_slver_080002ca;
+struct slver sha256_ctx_mgr_init_avx512_ni_slver = { 0x02ca, 0x00, 0x08 };
+
+struct slver sha256_ctx_mgr_submit_avx512_ni_slver_080002cb;
+struct slver sha256_ctx_mgr_submit_avx512_ni_slver = { 0x02cb, 0x00, 0x08 };
+
+struct slver sha256_ctx_mgr_flush_avx512_ni_slver_080002cc;
+struct slver sha256_ctx_mgr_flush_avx512_ni_slver = { 0x02cc, 0x00, 0x08 };
+
+#endif // HAVE_AS_KNOWS_AVX512 and HAVE_AS_KNOWS_SHANI
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c
new file mode 100644
index 000000000..58bf024a0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base.c
@@ -0,0 +1,301 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r))))
+
+#define W(x) w[(x) & 15]
+
+#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3))
+#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10))
+
+#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22))
+#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be32(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
+
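+// Note on the step() macro: sha256_single() below rotates the a..h argument
+// order by one position on each of its 64 step() calls, so the macro only
+// has to write two variables per round (the new "e" via d += t1 and the new
+// "a" via h = t1 + t2) instead of shifting all eight working variables.
+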
+static void sha256_init(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static uint32_t sha256_update(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static void sha256_final(SHA256_HASH_CTX * ctx, uint32_t remain_len);
+static void OPT_FIX sha256_single(const void *data, uint32_t digest[]);
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+
+void sha256_ctx_mgr_init_base(SHA256_HASH_CTX_MGR * mgr)
+{
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_base(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ uint32_t remain_len;
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) {
+ // Cannot submit a new entire job to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags == HASH_FIRST) {
+
+ sha256_init(ctx, buffer, len);
+ sha256_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_UPDATE) {
+ sha256_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_LAST) {
+ remain_len = sha256_update(ctx, buffer, len);
+ sha256_final(ctx, remain_len);
+ }
+
+ if (flags == HASH_ENTIRE) {
+ sha256_init(ctx, buffer, len);
+ remain_len = sha256_update(ctx, buffer, len);
+ sha256_final(ctx, remain_len);
+ }
+
+ return ctx;
+}
+
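+// The base path processes each job synchronously inside submit, so there is
+// never an in-flight job for flush to return.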
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_base(SHA256_HASH_CTX_MGR * mgr)
+{
+ return NULL;
+}
+
+static void sha256_init(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Mark it as processing
+ ctx->status = HASH_CTX_STS_PROCESSING;
+}
+
+static uint32_t sha256_update(SHA256_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ uint32_t remain_len = len;
+ uint32_t *digest = ctx->job.result_digest;
+
+ while (remain_len >= SHA256_BLOCK_SIZE) {
+ sha256_single(buffer, digest);
+ buffer = (void *)((uint8_t *) buffer + SHA256_BLOCK_SIZE);
+ remain_len -= SHA256_BLOCK_SIZE;
+ ctx->total_length += SHA256_BLOCK_SIZE;
+ }
+ ctx->status = HASH_CTX_STS_IDLE;
+ ctx->incoming_buffer = buffer;
+ return remain_len;
+}
+
+static void sha256_final(SHA256_HASH_CTX * ctx, uint32_t remain_len)
+{
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t i = remain_len, j;
+ uint8_t buf[2 * SHA256_BLOCK_SIZE];
+ uint32_t *digest = ctx->job.result_digest;
+
+ ctx->total_length += i;
+ memcpy(buf, buffer, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - SHA256_PADLENGTHFIELD_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA256_BLOCK_SIZE - SHA256_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA256_BLOCK_SIZE;
+ else
+ i = SHA256_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8);
+
+ sha256_single(buf, digest);
+ if (i == 2 * SHA256_BLOCK_SIZE) {
+ sha256_single(buf + SHA256_BLOCK_SIZE, digest);
+ }
+
+ ctx->status = HASH_CTX_STS_COMPLETE;
+}
+
+void sha256_single(const void *data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e, f, g, h, t1, t2;
+ uint32_t w[16];
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98);
+ step(1, h, a, b, c, d, e, f, g, 0x71374491);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+ step(31, b, c, d, e, f, g, h, a, 0x14292967);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+ step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+ step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_base_slver_000002f0;
+struct slver sha256_ctx_mgr_init_base_slver = { 0x02f0, 0x00, 0x00 };
+
+struct slver sha256_ctx_mgr_submit_base_slver_000002f1;
+struct slver sha256_ctx_mgr_submit_base_slver = { 0x02f1, 0x00, 0x00 };
+
+struct slver sha256_ctx_mgr_flush_base_slver_000002f2;
+struct slver sha256_ctx_mgr_flush_base_slver = { 0x02f2, 0x00, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c
new file mode 100644
index 000000000..1483f631c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_base_aliases.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+
+extern void sha256_ctx_mgr_init_base(SHA256_HASH_CTX_MGR * mgr);
+extern SHA256_HASH_CTX *sha256_ctx_mgr_submit_base(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx, const void *buffer,
+ uint32_t len, HASH_CTX_FLAG flags);
+extern SHA256_HASH_CTX *sha256_ctx_mgr_flush_base(SHA256_HASH_CTX_MGR * mgr);
+
+void sha256_ctx_mgr_init(SHA256_HASH_CTX_MGR * mgr)
+{
+ return sha256_ctx_mgr_init_base(mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ return sha256_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush(SHA256_HASH_CTX_MGR * mgr)
+{
+ return sha256_ctx_mgr_flush_base(mgr);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c
new file mode 100644
index 000000000..f85f5c88b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c
@@ -0,0 +1,256 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_sse(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_sse(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_sse(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_sse_slver_00020151;
+struct slver sha256_ctx_mgr_init_sse_slver = { 0x0151, 0x02, 0x00 };
+
+struct slver sha256_ctx_mgr_submit_sse_slver_00020152;
+struct slver sha256_ctx_mgr_submit_sse_slver = { 0x0152, 0x02, 0x00 };
+
+struct slver sha256_ctx_mgr_flush_sse_slver_00020153;
+struct slver sha256_ctx_mgr_flush_sse_slver = { 0x0153, 0x02, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c
new file mode 100644
index 000000000..e2c7e2738
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse_ni.c
@@ -0,0 +1,262 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_SHANI
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_sse_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+	// Same as sse
+ sha256_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_sse_ni(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_sse_ni(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_sse_ni(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse_ni(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA256_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_sse_ni_slver_070002c7;
+struct slver sha256_ctx_mgr_init_sse_ni_slver = { 0x02c7, 0x00, 0x07 };
+
+struct slver sha256_ctx_mgr_submit_sse_ni_slver_070002c8;
+struct slver sha256_ctx_mgr_submit_sse_ni_slver = { 0x02c8, 0x00, 0x07 };
+
+struct slver sha256_ctx_mgr_flush_sse_ni_slver_070002c9;
+struct slver sha256_ctx_mgr_flush_sse_ni_slver = { 0x02c9, 0x00, 0x07 };
+
+#endif // HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm
new file mode 100644
index 000000000..f9fb6d230
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm
@@ -0,0 +1,65 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Threshold constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; if the number of lanes in use is <= threshold, use the single-buffer (sb) function
+%define SHA256_SB_THRESHOLD_SSE 1
+%define SHA256_SB_THRESHOLD_AVX 1
+%define SHA256_SB_THRESHOLD_AVX2 1
+%define SHA256_SB_THRESHOLD_AVX512 1
+%define SHA256_NI_SB_THRESHOLD_SSE 4 ; shani is faster than sse sha256_mb
+%define SHA256_NI_SB_THRESHOLD_AVX512 6
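+; For example, sha256_mb_mgr_flush_avx compares _num_lanes_inuse against
+; SHA256_SB_THRESHOLD_AVX and, when at or below it, hands the shortest job
+; to the single-buffer sha256_opt_x1 routine instead of the multibuffer
+; kernel; the *_NI_* thresholds play the same role in the SHA-NI flush paths.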
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA256_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA256_JOB
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD	_len,	8,	8	; length in blocks
+FIELD _result_digest, 8*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+
+%assign _SHA256_JOB_size _FIELD_OFFSET
+%assign _SHA256_JOB_align _STRUCT_ALIGN
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c
new file mode 100644
index 000000000..28f1f5118
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_flush_test.c
@@ -0,0 +1,146 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS (SHA256_MAX_LANES - 1)
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+uint8_t lens_print_and_check(SHA256_HASH_CTX_MGR * mgr)
+{
+ static int32_t last_lens[SHA256_MAX_LANES] = { 0 };
+ int32_t len;
+ uint8_t num_unchanged = 0;
+ int i;
+ for (i = 0; i < SHA256_MAX_LANES; i++) {
+ len = (int32_t) mgr->mgr.lens[i];
+		// lens[i] in mgr is block_count<<4 | lane_index; convert to bytes for printing
+ len = (len >= 16) ? (len >> 4 << 6) : 0;
+ printf("\t%d", len);
+ if (last_lens[i] > 0 && last_lens[i] == len)
+ num_unchanged += 1;
+ last_lens[i] = len;
+ }
+ printf("\n");
+ return num_unchanged;
+}
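+
+// Example of the decode above (illustrative): a lens[] entry of 0x193 means
+// lane 3 with 0x19 = 25 blocks pending, printed as 25 << 6 = 1600 bytes.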
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ uint8_t num_ret, num_unchanged = 0;
+ int ret;
+
+ printf("sha256_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ lens[i] = TEST_LEN / SHA256_MAX_LANES * (i + 1);
+ bufs[i] = (unsigned char *)malloc(lens[i]);
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], lens[i]);
+ }
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Init ctx contexts
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ printf("Changes of lens inside mgr:\n");
+ lens_print_and_check(mgr);
+ while (sha256_ctx_mgr_flush(mgr)) {
+ num_ret = lens_print_and_check(mgr);
+ num_unchanged = num_unchanged > num_ret ? num_unchanged : num_ret;
+ }
+	printf("Done printing sha256_mb lens info\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else if (num_unchanged)
+		printf("SHA-NI is used when %d or %d jobs are still incomplete\n",
+		       num_unchanged, num_unchanged + 1);
+ else
+ printf("SHA-NI is not used, or used for last job\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..ebba9ca36
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm
@@ -0,0 +1,74 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA256 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA256_ARGS_X16
+;;; name size align
+FIELD _digest, 4*8*16, 4 ; transposed digest
+FIELD _data_ptr, 8*16, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _SHA256_ARGS_X4_size _FIELD_OFFSET
+%assign _SHA256_ARGS_X4_align _STRUCT_ALIGN
+%assign _SHA256_ARGS_X8_size _FIELD_OFFSET
+%assign _SHA256_ARGS_X8_align _STRUCT_ALIGN
+%assign _SHA256_ARGS_X16_size _FIELD_OFFSET
+%assign _SHA256_ARGS_X16_align _STRUCT_ALIGN
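+
+; Note: _digest is stored transposed - the lanes' values for a given digest
+; word are grouped together rather than one contiguous 8-word digest per
+; lane - which is why the flush routines gather a finished job's result one
+; word at a time (e.g. vmovd/vpinsrd from _args_digest + 4*idx + word*16 in
+; the 4-lane AVX manager).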
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _SHA256_ARGS_X4_size, _SHA256_ARGS_X4_align
+FIELD _lens, 4*16, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*16, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..69f27f42d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm
@@ -0,0 +1,253 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_avx
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_avx, function
+sha256_mb_mgr_flush_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
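+	; Give every empty lane the data pointer of lane "idx" and a length of
+	; 0xFFFFFFFF so that empty lanes never win the minimum-length search
+	; below and the multibuffer kernel still reads valid memory in each lane.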
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ ; compare with sha-sb threshold, if num_lanes_inuse <= threshold, using sb func
+ cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX
+ ja mb_processing
+
+	; lens[idx] == len2 | idx, so storing idx leaves this lane with 0 blocks remaining
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; avx has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..0ee0589cf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm
@@ -0,0 +1,274 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x8_avx2
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha256_mb_x8_avx2 and sha256_opt_x1
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx2(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_avx2, function
+sha256_mb_mgr_flush_avx2:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; compare against the single-buffer threshold: if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX2
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x2000 ; avx2 has 8 lanes *4, r10b is idx, r10b2 is 32
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x8_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
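The AVX2 flush above replaces the scalar cmp/cmovb chain with a vpminud/vpalignr reduction over the eight lens[] dwords, then masks off the low nibble of the winner (clear_low_nibble), broadcasts it, and subtracts it from every lane so each lane keeps its index while its remaining block count shrinks. A scalar C sketch of that subtraction step; names are illustrative, not part of the library:

    #include <stdint.h>

    /* min_enc is the reduction result: (common_blocks << 4) | winning_lane. */
    static void consume_common_blocks(uint32_t lens[8], uint32_t min_enc)
    {
    	uint32_t common = min_enc & ~0xFu;   /* vpand with clear_low_nibble */

    	for (unsigned i = 0; i < 8; i++)
    		lens[i] -= common;           /* vpshufd broadcast + vpsubd */
    }
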
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..201cd42b0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm
@@ -0,0 +1,288 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha256_mb_x16_avx512
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha256_mb_x16_avx512 and sha256_opt_x1
+%define idx rbp
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx512(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_avx512, function
+sha256_mb_mgr_flush_avx512:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; compare against the single-buffer threshold: if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_AVX512
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+	or	r10, 0x4000	; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_mgr_flush_avx512
+no_sha256_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
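The AVX-512 manager looks for a lane that still holds a job with a chain of cmp/cmovne instructions indexed through the lane_1..lane_15 constants: a branch-free linear scan where the last occupied lane seen wins and lane 0 is the default. A C equivalent, assuming 16 lanes (illustrative only):

    #include <stddef.h>

    /* Return the index of a lane whose job_in_lane pointer is non-NULL,
     * or 0 if only lane 0 (or no lane) is occupied -- as the asm scan does. */
    static unsigned find_busy_lane(void *const job_in_lane[16])
    {
    	unsigned idx = 0;

    	for (unsigned i = 1; i < 16; i++)
    		if (job_in_lane[i] != NULL)
    			idx = i;     /* cmovne idx, [lane_i] */
    	return idx;
    }
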
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm
new file mode 100644
index 000000000..7bc9d32a4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512_ni.asm
@@ -0,0 +1,295 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ %ifdef HAVE_AS_KNOWS_SHANI
+
+extern sha256_mb_x16_avx512
+extern sha256_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha256_mb_x16_avx512 and sha256_ni_x1
+%define idx rbp
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx512_ni(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_avx512_ni, function
+sha256_mb_mgr_flush_avx512_ni:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; compare against the SHA-NI single-buffer threshold: if num_lanes_inuse <= threshold, use the SHA-NI single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_NI_SB_THRESHOLD_AVX512
+ ja mb_processing
+
+ ; lensN-len2=idx
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+	or	r10, 0x4000	; avx512 has 16 lanes *4, r10b is idx, r10b2 is 64
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_ni_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+ %else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha256_mb_mgr_flush_avx512_ni
+ no_sha256_mb_mgr_flush_avx512_ni:
+ %endif
+ %endif ; HAVE_AS_KNOWS_SHANI
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+ global no_sha256_mb_mgr_flush_avx512_ni
+ no_sha256_mb_mgr_flush_avx512_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..69ae4bad5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm
@@ -0,0 +1,254 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_sse
+extern sha256_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_sse(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_sse, function
+sha256_mb_mgr_flush_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+	; compare against the single-buffer threshold: if num_lanes_inuse <= threshold, use the single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_SB_THRESHOLD_SSE
+ ja mb_processing
+
+ ; lensN-len2=idx
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_opt_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ movd xmm1, [state + _args_digest + 4*idx + 4*16]
+ pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
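All of these flush managers make the same dispatch decision: when only a few lanes are in use (at or below SHA256_SB_THRESHOLD_* or SHA256_NI_SB_THRESHOLD_*), the single-buffer kernel (sha256_opt_x1 or sha256_ni_x1) is run on the shortest lane instead of the multi-buffer core, since the lane transpose overhead is not worth paying for one or two jobs. A sketch of just that decision; the threshold value below is a placeholder, not the library's constant:

    #include <stdint.h>

    enum sha256_path { USE_SINGLE_BUFFER, USE_MULTI_BUFFER };

    #define SB_THRESHOLD 1   /* placeholder for SHA256_SB_THRESHOLD_SSE etc. */

    /* Mirrors "cmp dword [state + _num_lanes_inuse], THRESHOLD / ja mb_processing". */
    static enum sha256_path choose_path(uint32_t num_lanes_inuse)
    {
    	return (num_lanes_inuse <= SB_THRESHOLD) ? USE_SINGLE_BUFFER
    	                                         : USE_MULTI_BUFFER;
    }
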
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm
new file mode 100644
index 000000000..43b8fcbe4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse_ni.asm
@@ -0,0 +1,261 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha256_mb_x4_sse
+extern sha256_ni_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_sse_ni(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha256_mb_mgr_flush_sse_ni, function
+sha256_mb_mgr_flush_sse_ni:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+	; compare against the SHA-NI single-buffer threshold: if num_lanes_inuse <= threshold, use the SHA-NI single-buffer function
+ cmp dword [state + _num_lanes_inuse], SHA256_NI_SB_THRESHOLD_SSE
+ ja mb_processing
+
+ ; lensN-len2=idx
+ shr len2, 4
+ mov [state + _lens + idx*4], DWORD(idx)
+ mov r10, idx
+ or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_ni_x1
+ ; state and idx are intact
+ jmp len_is_0
+
+mb_processing:
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ movd xmm1, [state + _args_digest + 4*idx + 4*16]
+ pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
+%else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha256_mb_mgr_flush_sse_ni
+ no_sha256_mb_mgr_flush_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..903fb733b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+
+void sha256_mb_mgr_init_avx2(SHA256_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF76543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA256_X8_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
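unused_lanes is a packed nibble stack of free lane indices: 0xF76543210 here lists lanes 0..7 with 0xF as the top-of-stack sentinel. The submit managers pop the low nibble to allocate a lane, and the completion path pushes the lane back. A C sketch of those two operations; the helpers are illustrative, not isa-l API:

    #include <stdint.h>

    /* Pop a free lane index off the nibble stack (movzx + and 0xF, shr 4). */
    static unsigned pop_lane(uint64_t *unused_lanes)
    {
    	unsigned lane = (unsigned)(*unused_lanes & 0xF);

    	*unused_lanes >>= 4;
    	return lane;
    }

    /* Push a lane index back once its job completes (shl 4, or idx). */
    static void push_lane(uint64_t *unused_lanes, unsigned lane)
    {
    	*unused_lanes = (*unused_lanes << 4) | lane;
    }
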
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..b875735f9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+
+void sha256_mb_mgr_init_avx512(SHA256_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xfedcba9876543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA256_MAX_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c
new file mode 100644
index 000000000..cf22c4aee
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+
+void sha256_mb_mgr_init_sse(SHA256_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF3210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA256_MIN_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
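The three init variants differ only in lane count and in the matching unused_lanes constant: 0xF3210 for the 4-lane SSE/AVX managers, 0xF76543210 for the 8-lane AVX2 manager, and 0xfedcba9876543210 for the 16-lane AVX-512 manager (where no nibble is left over for a sentinel). A small sketch of how such a constant is formed; the helper is purely illustrative:

    #include <stdint.h>

    /* Build the initial free-lane nibble stack for nlanes <= 15:
     * indices nlanes-1 .. 0 with a 0xF sentinel above them,
     * e.g. 0xF3210 for 4 lanes and 0xF76543210 for 8 lanes. */
    static uint64_t initial_unused_lanes(unsigned nlanes)
    {
    	uint64_t v = 0xF;

    	for (int i = (int)nlanes - 1; i >= 0; i--)
    		v = (v << 4) | (unsigned)i;
    	return v;
    }
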
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..cb7d5790a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm
@@ -0,0 +1,260 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_avx
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE 16*10
+%define _GPR_SAVE 8*5
+%define STACK_SPACE _GPR_SAVE + _XMM_SAVE
+
+; SHA256_JOB* sha256_mb_mgr_submit_avx(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha256_mb_mgr_submit_avx, function
+sha256_mb_mgr_submit_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqa [rsp + 16*0], xmm6
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ vmovdqa [rsp + 16*4], xmm10
+ vmovdqa [rsp + 16*5], xmm11
+ vmovdqa [rsp + 16*6], xmm12
+ vmovdqa [rsp + 16*7], xmm13
+ vmovdqa [rsp + 16*8], xmm14
+ vmovdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*16], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*16], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 16*0]
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ vmovdqa xmm10, [rsp + 16*4]
+ vmovdqa xmm11, [rsp + 16*5]
+ vmovdqa xmm12, [rsp + 16*6]
+ vmovdqa xmm13, [rsp + 16*7]
+ vmovdqa xmm14, [rsp + 16*8]
+ vmovdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
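The submit path above scatters the job's result_digest into args_digest with vpextrd at byte offsets 4*lane + k*16, and the completion path gathers it back with vmovd/vpinsrd: the digests are stored transposed, with word k of lane l at dword index k*NLANES + l, so the multi-buffer core can load word k of every lane in a single vector load. A C sketch of that layout for the 4-lane managers; the helper names are illustrative:

    #include <stdint.h>

    #define NLANES 4   /* 8 for the AVX2 manager, 16 for AVX-512 */

    /* Scatter a job's 8-word digest into the transposed args_digest array
     * (what the vpextrd stores in submit do). */
    static void scatter_digest(uint32_t args_digest[8 * NLANES],
    			   const uint32_t result_digest[8], unsigned lane)
    {
    	for (unsigned k = 0; k < 8; k++)
    		args_digest[k * NLANES + lane] = result_digest[k];
    }

    /* Gather it back on completion (what the vmovd/vpinsrd loads do). */
    static void gather_digest(uint32_t result_digest[8],
    			  const uint32_t args_digest[8 * NLANES], unsigned lane)
    {
    	for (unsigned k = 0; k < 8; k++)
    		result_digest[k] = args_digest[k * NLANES + lane];
    }
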
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..af2fc89ea
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm
@@ -0,0 +1,246 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "memcpy.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x8_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define idx r8
+%define last_len r8
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; SHA256_JOB* sha256_mb_mgr_submit_avx2(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha256_mb_mgr_submit_avx2, function
+sha256_mb_mgr_submit_avx2:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ mov [lane_data + _job_in_lane], job
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovdqu xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*8], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*8], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*8], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*8], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*4*8], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*4*8], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*4*8], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*4*8], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xf
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x8_avx2
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
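Note on the scheduler above: each _lens entry packs the job's length value shifted left by four with the owning lane index in the low nibble, so a single unsigned minimum across the vector yields both the shortest job and its lane; the vpminud/vpalignr ladder computes that minimum, and the clear_low_nibble mask lets the common length be subtracted from every lane without disturbing the lane nibbles. The C below is a minimal scalar sketch of the same bookkeeping, not ISA-L code; find_min_lane and NUM_LANES are illustrative names.

#include <stdint.h>

#define NUM_LANES 8			/* the AVX2 manager runs 8 lanes */

/* Scalar model of start_loop: lens[i] == (remaining_len << 4) | i. */
static uint32_t find_min_lane(uint32_t lens[NUM_LANES], uint32_t *common_len)
{
	uint32_t min = lens[0];
	int i;

	for (i = 1; i < NUM_LANES; i++)
		if (lens[i] < min)
			min = lens[i];

	uint32_t idx = min & 0xF;	/* lane that finishes first */
	*common_len = min >> 4;		/* length every lane can hash now */

	/* Subtract the common length but keep each lane nibble intact,
	 * mirroring the clear_low_nibble mask and vpsubd sequence above. */
	for (i = 0; i < NUM_LANES; i++)
		lens[i] -= (min & ~0xFu);

	return idx;
}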
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..cdc477370
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm
@@ -0,0 +1,261 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "memcpy.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha256_mb_x16_avx512
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define idx r8
+%define last_len r8
+%define p r11
+%define start_offset r11
+%define num_lanes_inuse r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; SHA256_JOB* sha256_mb_mgr_submit_avx512(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha256_mb_mgr_submit_avx512, function
+sha256_mb_mgr_submit_avx512:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ mov [lane_data + _job_in_lane], job
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovdqu xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*16], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*16], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*16], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*16], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*4*16], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*4*16], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*4*16], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*4*16], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 16
+ jne return_null
+
+start_loop:
+	; Find min length; ymm0 holds the first 8 lane lengths, ymm1 holds the last 8
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+	vperm2i128 ymm3, ymm2, ymm2, 1	; ymm3 has {x,x, x,C3,x,x, x,G3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x16_avx512
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=32
+
+align 32
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_mgr_submit_avx512
+no_sha256_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
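Note on lane bookkeeping: _unused_lanes is a small stack of free lane indices packed four bits apiece, popped on submit with the and/shr pair and pushed back on completion with shl/or; _num_lanes_inuse counts occupied lanes, and the AVX-512 variant uses that counter (rather than a sentinel in _unused_lanes) to decide when all 16 lanes are full and the 16-lane core may run. The C below is an illustrative model of the nibble stack only, not ISA-L code; pop_lane and push_lane are hypothetical names.

#include <stdint.h>

/* Pop the lowest nibble: the index of a free lane to give to a new job. */
static unsigned pop_lane(uint64_t *unused_lanes)
{
	unsigned lane = (unsigned)(*unused_lanes & 0xF);
	*unused_lanes >>= 4;
	return lane;
}

/* Push a lane index back once its job has completed. */
static void push_lane(uint64_t *unused_lanes, unsigned lane)
{
	*unused_lanes = (*unused_lanes << 4) | lane;
}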
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..b1bbc7002
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm
@@ -0,0 +1,261 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_sse
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE 16*10
+%define _GPR_SAVE 8*5
+%define STACK_SPACE _GPR_SAVE + _XMM_SAVE
+
+; SHA256_JOB* sha256_mb_mgr_submit_sse(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha256_mb_mgr_submit_sse, function
+sha256_mb_mgr_submit_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ movdqa [rsp + 16*0], xmm6
+ movdqa [rsp + 16*1], xmm7
+ movdqa [rsp + 16*2], xmm8
+ movdqa [rsp + 16*3], xmm9
+ movdqa [rsp + 16*4], xmm10
+ movdqa [rsp + 16*5], xmm11
+ movdqa [rsp + 16*6], xmm12
+ movdqa [rsp + 16*7], xmm13
+ movdqa [rsp + 16*8], xmm14
+ movdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ movdqa xmm1, [job + _result_digest + 1*16]
+ movd [state + _args_digest + 4*lane + 0*16], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ movd [state + _args_digest + 4*lane + 4*16], xmm1
+ pextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1
+ pextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2
+ pextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ movd xmm1, [state + _args_digest + 4*idx + 4*16]
+ pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 16*0]
+ movdqa xmm7, [rsp + 16*1]
+ movdqa xmm8, [rsp + 16*2]
+ movdqa xmm9, [rsp + 16*3]
+ movdqa xmm10, [rsp + 16*4]
+ movdqa xmm11, [rsp + 16*5]
+ movdqa xmm12, [rsp + 16*6]
+ movdqa xmm13, [rsp + 16*7]
+ movdqa xmm14, [rsp + 16*8]
+ movdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm
new file mode 100644
index 000000000..cb1dce641
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse_ni.asm
@@ -0,0 +1,301 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+extern sha256_mb_x4_sse
+extern sha256_ni_x2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE 16*10
+%define _GPR_SAVE 8*7
+%define STACK_SPACE _GPR_SAVE + _XMM_SAVE
+
+; SHA256_JOB* sha256_mb_mgr_submit_sse_ni(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha256_mb_mgr_submit_sse_ni, function
+sha256_mb_mgr_submit_sse_ni:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+ mov [rsp + _XMM_SAVE + 8*5], r13
+ mov [rsp + _XMM_SAVE + 8*6], r14
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ movdqa [rsp + 16*0], xmm6
+ movdqa [rsp + 16*1], xmm7
+ movdqa [rsp + 16*2], xmm8
+ movdqa [rsp + 16*3], xmm9
+ movdqa [rsp + 16*4], xmm10
+ movdqa [rsp + 16*5], xmm11
+ movdqa [rsp + 16*6], xmm12
+ movdqa [rsp + 16*7], xmm13
+ movdqa [rsp + 16*8], xmm14
+ movdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ movdqa xmm1, [job + _result_digest + 1*16]
+ movd [state + _args_digest + 4*lane + 0*16], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ movd [state + _args_digest + 4*lane + 4*16], xmm1
+ pextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1
+ pextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2
+ pextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+
+ cmp unused_lanes, 0xF32 ; we will process two jobs at the same time
+ jne return_null ; wait for another sha_ni job
+
+	; Compare against the SHA-NI single-buffer threshold: if the SSE lane count (4) is at or below it, use the SHA-NI path
+ %if SHA256_NI_SB_THRESHOLD_SSE >= 4 ; there are 4 lanes in sse mb
+ ; shani glue code
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+ ; lensN-len2=idx
+ sub lens0, len2
+ sub lens1, len2
+
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov r10, idx
+ or r10, 0x1000 ; sse has 4 lanes *4, r10b is idx, r10b2 is 16
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2, idx and nlane in r10
+ call sha256_ni_x2
+ ; state and idx are intact
+ %else
+ ; original mb code
+ cmp unused_lanes, 0xF
+ jne return_null
+
+ start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_sse
+ ; state and idx are intact
+ %endif
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ movd xmm1, [state + _args_digest + 4*idx + 4*16]
+ pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 16*0]
+ movdqa xmm7, [rsp + 16*1]
+ movdqa xmm8, [rsp + 16*2]
+ movdqa xmm9, [rsp + 16*3]
+ movdqa xmm10, [rsp + 16*4]
+ movdqa xmm11, [rsp + 16*5]
+ movdqa xmm12, [rsp + 16*6]
+ movdqa xmm13, [rsp + 16*7]
+ movdqa xmm14, [rsp + 16*8]
+ movdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov r13, [rsp + _XMM_SAVE + 8*5]
+ mov r14, [rsp + _XMM_SAVE + 8*6]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
+%else
+ %ifidn __OUTPUT_FORMAT__, win64
+ global no_sha256_mb_mgr_submit_sse_ni
+ no_sha256_mb_mgr_submit_sse_ni:
+ %endif
+%endif ; HAVE_AS_KNOWS_SHANI
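Note: when assembled with SHA-NI support, the submit path above batches jobs in pairs (the 0xF32 comparison waits for a second job); whether it then calls the two-lane sha256_ni_x2 kernel or falls back to the classic four-lane sha256_mb_x4_sse scheduling is decided at assembly time by the %if on SHA256_NI_SB_THRESHOLD_SSE.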
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c
new file mode 100644
index 000000000..768bfca78
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c
@@ -0,0 +1,160 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ int ret;
+
+ printf("multibinary_sha256 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ SHA256(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256_ssl rand: Pass\n");
+
+ return fail;
+}
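Condensed, the submit/flush pattern this test exercises looks like the sketch below. This is a minimal, illustrative example rather than ISA-L code: error handling is omitted, hash_one is a hypothetical helper, and real callers submit many contexts before flushing so that all lanes stay busy.

#include <stdlib.h>
#include "sha256_mb.h"

/* Hash one buffer with the multi-buffer manager (illustrative only). */
static void hash_one(unsigned char *buf, uint32_t len,
		     uint32_t out[SHA256_DIGEST_NWORDS])
{
	SHA256_HASH_CTX_MGR *mgr = NULL;
	SHA256_HASH_CTX ctx;
	uint32_t j;

	posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
	sha256_ctx_mgr_init(mgr);
	hash_ctx_init(&ctx);

	sha256_ctx_mgr_submit(mgr, &ctx, buf, len, HASH_ENTIRE);
	while (sha256_ctx_mgr_flush(mgr))
		;	/* drain until every lane is idle */

	for (j = 0; j < SHA256_DIGEST_NWORDS; j++)
		out[j] = ctx.job.result_digest[j];
	free(mgr);
}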
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c
new file mode 100644
index 000000000..adba77f3d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c
@@ -0,0 +1,203 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+ int ret;
+
+ printf("multibinary_sha256 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha256_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+ // Run sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sha256_mb test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%08X <=> 0x%08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ sha256_ctx_mgr_init(mgr);
+
+ // Extend to the end of allocated buffer to construct jobs
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%08X"
+ ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256 rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c
new file mode 100644
index 000000000..9535d80df
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c
@@ -0,0 +1,300 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*SHA256_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA256_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+ int ret;
+
+ printf("multibinary_sha256_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha256_ref(bufs[i], digest_ref[i], TEST_LEN);
+ }
+
+ // Run sb_sha256 tests
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ // Add jobs while available or finished
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha256_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha256_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha256_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+				printf("Test%d fixed size, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ sha256_ref(bufs[i], digest_ref[i], lens[i]);
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ // Run sha256_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = SHA256_BLOCK_SIZE +
+ SHA256_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % SHA256_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem,
+ HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha256_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha256_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % SHA256_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha256_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256_update rand: Pass\n");
+
+ return fail;
+}
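For reference, the HASH_FIRST/HASH_UPDATE/HASH_LAST discipline this update test drives can be condensed as below. This is a minimal single-job sketch, not ISA-L code: as in the test, every chunk except the last is assumed to be a multiple of SHA256_BLOCK_SIZE, len is assumed to be larger than chunk, a context is only resubmitted after submit or flush has handed it back, and submit_piece/hash_streaming are hypothetical helper names.

#include "sha256_mb.h"

/* Submit one chunk and wait until the manager hands this context back. */
static SHA256_HASH_CTX *submit_piece(SHA256_HASH_CTX_MGR *mgr, SHA256_HASH_CTX *ctx,
				     unsigned char *p, uint32_t n, int flag)
{
	SHA256_HASH_CTX *r = sha256_ctx_mgr_submit(mgr, ctx, p, n, flag);
	while (r == NULL)
		r = sha256_ctx_mgr_flush(mgr);
	return r;
}

static void hash_streaming(SHA256_HASH_CTX_MGR *mgr, SHA256_HASH_CTX *ctx,
			   unsigned char *buf, uint32_t len, uint32_t chunk)
{
	uint32_t off = 0;

	submit_piece(mgr, ctx, buf, chunk, HASH_FIRST);
	off += chunk;
	while (len - off > chunk) {
		submit_piece(mgr, ctx, buf + off, chunk, HASH_UPDATE);
		off += chunk;
	}
	submit_piece(mgr, ctx, buf + off, len - off, HASH_LAST);
	/* the digest is now in ctx->job.result_digest */
}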
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c
new file mode 100644
index 000000000..8a5b5a9b2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c
@@ -0,0 +1,241 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha256_mb.h"
+
+typedef uint32_t DigestSHA256[SHA256_DIGEST_NWORDS];
+
+#define MSGS 7
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
+
+static uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
+static uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO";
+static uint8_t msg3[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<";
+static uint8_t msg4[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR";
+static uint8_t msg5[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?";
+static uint8_t msg6[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU";
+static uint8_t msg7[] = "";
+
+static DigestSHA256 expResultDigest1 = { 0x248D6A61, 0xD20638B8, 0xE5C02693, 0x0C3E6039,
+ 0xA33CE459, 0x64FF2167, 0xF6ECEDD4, 0x19DB06C1
+};
+
+static DigestSHA256 expResultDigest2 = { 0xD9C2E699, 0x586B948F, 0x4022C799, 0x4FFE14C6,
+ 0x3A4E8E31, 0x2EE2AEE1, 0xEBE51BED, 0x85705CFD
+};
+
+static DigestSHA256 expResultDigest3 = { 0xE3057651, 0x81295681, 0x7ECF1791, 0xFF9A1619,
+ 0xB2BC5CAD, 0x2AC00018, 0x92AE489C, 0x48DD10B3
+};
+
+static DigestSHA256 expResultDigest4 = { 0x0307DAA3, 0x7130A140, 0x270790F9, 0x95B71407,
+ 0x8EC752A6, 0x084EC1F3, 0xBD873D79, 0x3FF78383
+};
+
+static DigestSHA256 expResultDigest5 = { 0x679312F7, 0x2E18D599, 0x5F51BDC6, 0x4ED56AFD,
+ 0x9B5704D3, 0x4387E11C, 0xC2331089, 0x2CD45DAA
+};
+
+static DigestSHA256 expResultDigest6 = { 0x8B1767E9, 0x7BA7BBE5, 0xF9A6E8D9, 0x9996904F,
+ 0x3AF6562E, 0xA58AF438, 0x5D8D584B, 0x81C808CE
+};
+
+static DigestSHA256 expResultDigest7 = { 0xE3B0C442, 0x98FC1C14, 0x9AFBF4C8, 0x996FB924,
+ 0x27AE41E4, 0x649B934C, 0xA495991B, 0x7852B855
+};
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 };
+
+static uint32_t *expResultDigest[MSGS] = {
+ expResultDigest1, expResultDigest2, expResultDigest3,
+ expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7
+};
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
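+	// sha256_ctx_mgr_submit() returns a completed context or NULL if nothing
+	// has finished yet; the flush loop below drains the jobs still in flight
+	// and checks their digests.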
+ while (1) {
+ ctx = sha256_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+				       " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = sha256_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+				       " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sha256 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..51759d7a8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c
@@ -0,0 +1,129 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 4000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 20
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
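+
+/* Illustrative note: the cached ("warm") variant is selected at compile time
+ * by defining CACHED_TEST (something like `cc -DCACHED_TEST ...`); the exact
+ * flags and targets come from the library's build system and may differ.
+ * Without it, the cold, larger-than-LLC configuration above is used. */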
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sha256_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha256_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha256" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer sha256 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c
new file mode 100644
index 000000000..235ec74a8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_shortage_perf.c
@@ -0,0 +1,132 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS SHA256_MAX_LANES
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ uint32_t nlanes;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sha256_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha256_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb shortage tests
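+	// The sweep below resubmits the buffers with progressively fewer lanes
+	// (SHA256_MAX_LANES down to 1) to show how multi-buffer throughput
+	// degrades when the manager cannot be kept fully loaded.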
+ for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) {
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < nlanes; i++)
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN,
+ HASH_ENTIRE);
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha256" TEST_TYPE_STR " with %d lanes: ", nlanes);
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < nlanes; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ }
+
+ printf("Multi-buffer sha256 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+		printf(" multibinary_sha256_ossl_shortage_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm
new file mode 100644
index 000000000..f45669c6e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm
@@ -0,0 +1,930 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute 16-wide SHA256 using AVX512
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers zmm0-31
+
+%define APPEND(a,b) a %+ b
+
+; Define Stack Layout
+START_FIELDS
+;;; name size align
+FIELD _DIGEST_SAVE, 8*64, 64
+FIELD _rsp, 8, 8
+%assign STACK_SPACE _FIELD_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0 preserved
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2 preserved
+ %define reg4 r9 ; arg3
+ %define var1 rdi
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var1 rdx ; arg2
+ %define var2 rcx ; arg3
+ %define local_func_decl(func_name) mk_global func_name, function, internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+
+%define IDX var1
+%define TBL var2
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+%define T1 zmm8
+%define TMP0 zmm9
+%define TMP1 zmm10
+%define TMP2 zmm11
+%define TMP3 zmm12
+%define TMP4 zmm13
+%define TMP5 zmm14
+%define TMP6 zmm15
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+
+; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
+; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
+; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
+; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
+; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
+; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
+; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
+; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
+; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}
+
+; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; use r6 in place of t0
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
+
+	vshufps	%%r11, %%r6, %%t1, 0xDD	; r11 = {l13 k13 j13 i13  l9  k9  j9  i9   l5  k5  j5  i5   l1  k1  j1  i1}
+	vshufps	%%r9, %%r8, %%r10, 0x88	; r9  = {l14 k14 j14 i14  l10 k10 j10 i10  l6  k6  j6  i6   l2  k2  j2  i2}
+	vshufps	%%r8, %%r8, %%r10, 0xDD	; r8  = {l15 k15 j15 i15  l11 k11 j11 i11  l7  k7  j7  i7   l3  k3  j3  i3}
+	vshufps	%%r6, %%r6, %%t1, 0x88	; r6  = {l12 k12 j12 i12  l8  k8  j8  i8   l4  k4  j4  i4   l0  k0  j0  i0}
+
+ ; use r10 in place of t0
+	vshufps	%%r10, %%r12, %%r13, 0x44	; r10 = {n13 n12 m13 m12  n9  n8  m9  m8   n5  n4  m5  m4   n1  n0  m1  m0}
+	vshufps	%%r12, %%r12, %%r13, 0xEE	; r12 = {n15 n14 m15 m14  n11 n10 m11 m10  n7  n6  m7  m6   n3  n2  m3  m2}
+	vshufps	%%t1, %%r14, %%r15, 0x44	; t1  = {p13 p12 o13 o12  p9  p8  o9  o8   p5  p4  o5  o4   p1  p0  o1  o0}
+	vshufps	%%r14, %%r14, %%r15, 0xEE	; r14 = {p15 p14 o15 o14  p11 p10 o11 o10  p7  p6  o7  o6   p3  p2  o3  o2}
+
+	vshufps	%%r15, %%r10, %%t1, 0xDD	; r15 = {p13 o13 n13 m13  p9  o9  n9  m9   p5  o5  n5  m5   p1  o1  n1  m1}
+	vshufps	%%r13, %%r12, %%r14, 0x88	; r13 = {p14 o14 n14 m14  p10 o10 n10 m10  p6  o6  n6  m6   p2  o2  n2  m2}
+	vshufps	%%r12, %%r12, %%r14, 0xDD	; r12 = {p15 o15 n15 m15  p11 o11 n11 m11  p7  o7  n7  m7   p3  o3  n3  m3}
+	vshufps	%%r10, %%r10, %%t1, 0x88	; r10 = {p12 o12 n12 m12  p8  o8  n8  m8   p4  o4  n4  m4   p0  o0  n0  m0}
+
+;; At this point, the registers that contain interesting data are:
+;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
+;; Can use t1 and r14 as scratch registers
+
+ vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
+ vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
+
+ vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
+ vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
+
+ vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
+ vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
+
+ vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
+ vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
+
+;; At this point r8 and r12 can be used as scratch registers
+
+ vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+
+ vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+ vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+ vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+ vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+ vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+ vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+
+ vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+%endmacro
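+
+;; TRANSPOSE16 above performs a full 16x16 transpose of 32-bit elements in
+;; three stages: vshufps pairs within 128-bit lanes, vpermi2q with the
+;; PSHUFFLE_TRANSPOSE16_MASK tables to gather the 128-bit quarters, and
+;; vshuff64x2 to place the 256-bit halves, so each output register ends up
+;; holding the same word position from all 16 lanes.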
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ H
+%xdefine H G
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+;; CH(A, B, C) = (A&B) ^ (~A&C)
+;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G)
+;; SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22
+;; SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25
+;; sigma0 = ROR_7 ^ ROR_18 ^ SHR_3
+;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10
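+;;
+;; In the rounds below each of these three-input boolean functions is a single
+;; vpternlogd; the immediate is the 8-bit truth table of the function, indexed
+;; by (dst_bit<<2)|(src1_bit<<1)|src2_bit:
+;;   0xCA -> CH   (op1 ? op2 : op3)
+;;   0xE8 -> MAJ  (majority of the three inputs)
+;;   0x96 -> three-way XOR, used to combine the rotates for SIGMA/sigma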
+
+; Main processing loop per round
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%ROUND %2
+ ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ vpaddd T1, H, TMP3 ; T1 = H + Kt
+ vmovdqa32 TMP0, E
+ vprord TMP1, E, 6 ; ROR_6(E)
+ vprord TMP2, E, 11 ; ROR_11(E)
+ vprord TMP3, E, 25 ; ROR_25(E)
+ vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddd T1, T1, %%WT ; T1 = T1 + Wt
+ vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E)
+ vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E)
+ vpaddd D, D, T1 ; D = D + T1
+
+ vprord H, A, 2 ; ROR_2(A)
+ vprord TMP2, A, 13 ; ROR_13(A)
+ vprord TMP3, A, 22 ; ROR_22(A)
+ vmovdqa32 TMP0, A
+ vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A)
+ vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+ vpaddd H, H, T1 ; H(A) = H(T2) + T1
+
+ vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
+
+; This is supposed to be SKL optimized assuming:
+; vpternlog, vpaddd ports 5,8
+; vprord ports 1,8
+; However, vprord is only working on port 8
+;
+; Main processing loop per round
+; Get the msg schedule word Wt+16 from the current, now unnecessary, word
+%macro PROCESS_LOOP_00_47 5
+%define %%WT %1
+%define %%ROUND %2
+%define %%WTp1 %3
+%define %%WTp9 %4
+%define %%WTp14 %5
+ ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ ;; For next value in msg schedule
+ ;; Wt+16 = sigma1(Wt+14) + Wt+9 + sigma0(Wt+1) + Wt
+
+ vmovdqa32 TMP0, E
+ vprord TMP1, E, 6 ; ROR_6(E)
+ vprord TMP2, E, 11 ; ROR_11(E)
+ vprord TMP3, E, 25 ; ROR_25(E)
+ vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddd T1, H, %%WT ; T1 = H + Wt
+ vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E)
+ vpaddd T1, T1, TMP6 ; T1 = T1 + Kt
+ vprord H, A, 2 ; ROR_2(A)
+ vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vprord TMP2, A, 13 ; ROR_13(A)
+ vmovdqa32 TMP0, A
+ vprord TMP3, A, 22 ; ROR_22(A)
+ vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E)
+ vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpaddd D, D, T1 ; D = D + T1
+ vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A)
+ vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2)
+ vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+ vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2)
+ vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2)
+ vpaddd H, H, T1 ; H(A) = H(T2) + T1
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2)
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2)
+ vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15)
+ vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15)
+ vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+ vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15)
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) +
+						;      Wt-7 + sigma0(Wt-15)
+
+ vmovdqa32 TMP6, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
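+
+; Note: as written, the 48-round loop below uses PROCESS_LOOP together with
+; MSG_SCHED_ROUND_16_63 rather than this fused variant; the commented-out call
+; near the "%rep 48" block shows how PROCESS_LOOP_00_47 would be invoked.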
+
+%macro MSG_SCHED_ROUND_16_63 4
+%define %%WT %1
+%define %%WTp1 %2
+%define %%WTp9 %3
+%define %%WTp14 %4
+ vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2)
+ vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2)
+ vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2)
+ vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+
+ vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15)
+ vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15)
+ vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) +
+						;      Wt-7 + sigma0(Wt-15)
+%endmacro
+
+; Note this is reading in a block of data for one lane
+; When all 16 are read, the data must be transposed to build msg schedule
+%macro MSG_SCHED_ROUND_00_15 2
+%define %%WT %1
+%define %%OFFSET %2
+ mov inp0, [IN + (%%OFFSET*8)]
+ vmovups %%WT, [inp0+IDX]
+%endmacro
+
+align 64
+
+;; void sha256_mb_x16_avx512(SHA256_MB_ARGS_X16, uint32_t size)
+; arg 1 : pointer to SHA256_MB_ARGS_X16 structure (digests and input data pointers)
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+local_func_decl(sha256_mb_x16_avx512)
+sha256_mb_x16_avx512:
+ endbranch
+ mov rax, rsp
+ sub rsp, STACK_SPACE
+ and rsp, ~63 ; align stack to multiple of 64
+ mov [rsp + _rsp], rax
+ lea TBL, [TABLE]
+
+ ;; Initialize digests
+ vmovups A, [DIGEST + 0*64]
+ vmovups B, [DIGEST + 1*64]
+ vmovups C, [DIGEST + 2*64]
+ vmovups D, [DIGEST + 3*64]
+ vmovups E, [DIGEST + 4*64]
+ vmovups F, [DIGEST + 5*64]
+ vmovups G, [DIGEST + 6*64]
+ vmovups H, [DIGEST + 7*64]
+
+ ; Do we need to transpose digests???
+	; SHA1 does not, but SHA256 has been doing so
+
+ xor IDX, IDX
+
+ ;; Read in first block of input data
+ ;; Transpose input data
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+
+ vmovups W0,[inp0+IDX]
+ vmovups W1,[inp1+IDX]
+ vmovups W2,[inp2+IDX]
+ vmovups W3,[inp3+IDX]
+ vmovups W4,[inp4+IDX]
+ vmovups W5,[inp5+IDX]
+ vmovups W6,[inp6+IDX]
+ vmovups W7,[inp7+IDX]
+
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+
+ vmovups W8, [inp0+IDX]
+ vmovups W9, [inp1+IDX]
+ vmovups W10,[inp2+IDX]
+ vmovups W11,[inp3+IDX]
+ vmovups W12,[inp4+IDX]
+ vmovups W13,[inp5+IDX]
+ vmovups W14,[inp6+IDX]
+ vmovups W15,[inp7+IDX]
+
+
+lloop:
+ vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ vmovdqa32 TMP3, [TBL] ; First K
+
+ ; Save digests for later addition
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ add IDX, 64
+
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+
+%assign I 0
+%rep 16
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+
+ ; MSG Schedule for W0-W15 is now complete in registers
+ ; Process first 48 rounds
+ ; Calculate next Wt+16 after processing is complete and Wt is unneeded
+
+ ; PROCESS_LOOP_00_47 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep 48
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_63 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+
+	; Check if this is the last block
+ sub SIZE, 1
+ je lastLoop
+
+ ; Process last 16 rounds
+ ; Read in next block msg data for use in first 16 words of msg sched
+%assign I 48
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_00_15 APPEND(W,J), J
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ jmp lloop
+
+lastLoop:
+ ; Process last 16 rounds
+%assign I 48
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+	;; update input data pointers
+%assign I 0
+%rep 8
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ ; Write out digest
+ ; Do we need to untranspose digests???
+ vmovups [DIGEST + 0*64], A
+ vmovups [DIGEST + 1*64], B
+ vmovups [DIGEST + 2*64], C
+ vmovups [DIGEST + 3*64], D
+ vmovups [DIGEST + 4*64], E
+ vmovups [DIGEST + 5*64], F
+ vmovups [DIGEST + 6*64], G
+ vmovups [DIGEST + 7*64], H
+
+
+ mov rsp, [rsp + _rsp]
+ ret
+
+ section .data
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_x16_avx512
+no_sha256_mb_x16_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm
new file mode 100644
index 000000000..7f8f8829b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm
@@ -0,0 +1,431 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA256 using AVX
+;; Logic designed/laid out by JDG
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t1}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+
+%define TABLE K256_4_MB
+%define SZ 4
+%define SZ4 4*SZ
+%define ROUNDS 64*SZ4
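+; ROUND advances in byte units: each round adds SZ4 (one four-lane K entry) and
+; indexes TABLE directly, so ROUNDS = 64*SZ4 is the offset reached after 64 rounds.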
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, (32-(%%imm))
+ vpsrld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, (32-(%%imm))
+ vpsrld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
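+
+; AVX has no 32-bit vector rotate instruction, so PRORD/PRORD_nd build the
+; rotate from two shifts and an OR; the scalar equivalent is
+; ror32(x, imm) = (x >> imm) | (x << (32 - imm)).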
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ4*(%%i&0xf) + rsp], %%T1
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+
+ vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp]
+ vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp]
+ vmovdqa a0, %%T1
+ PRORD %%T1, 18-7
+ vmovdqa a2, a1
+ PRORD a1, 19-17
+ vpxor %%T1, %%T1, a0
+ PRORD %%T1, 7
+ vpxor a1, a1, a2
+ PRORD a1, 17
+ vpsrld a0, a0, 3
+ vpxor %%T1, %%T1, a0
+ vpsrld a2, a2, 10
+ vpxor a1, a1, a2
+ vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp]
+ vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + rsp]
+ vpaddd %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+%define DIGEST_SIZE 8*SZ4
+%define DATA 16*SZ4
+%define ALIGNMENT 1*8
+; ALIGNMENT makes FRAMESZ + pushes an odd multiple of 8
+%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT)
+%define _DIGEST (DATA)
+
+%define VMOVPS vmovups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define IDX rax
+%define ROUND rbx
+%define TBL r12
+
+;; void sha256_mb_x4_avx(SHA256_MB_ARGS_X8 *args, uint64_t len);
+;; arg 1 : arg1 : pointer args (only 4 of the 8 lanes used)
+;; arg 2 : arg2 : size of data in blocks (assumed >= 1)
+;;
+;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15
+;;
+mk_global sha256_mb_x4_avx, function, internal
+align 32
+sha256_mb_x4_avx:
+ endbranch
+ sub rsp, FRAMESZ
+
+ ;; Initialize digests
+ vmovdqa a,[arg1+0*SZ4]
+ vmovdqa b,[arg1+1*SZ4]
+ vmovdqa c,[arg1+2*SZ4]
+ vmovdqa d,[arg1+3*SZ4]
+ vmovdqa e,[arg1+4*SZ4]
+ vmovdqa f,[arg1+5*SZ4]
+ vmovdqa g,[arg1+6*SZ4]
+ vmovdqa h,[arg1+7*SZ4]
+
+ lea TBL,[TABLE]
+
+ ;; transpose input onto stack
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ4], a
+ vmovdqa [rsp + _DIGEST + 1*SZ4], b
+ vmovdqa [rsp + _DIGEST + 2*SZ4], c
+ vmovdqa [rsp + _DIGEST + 3*SZ4], d
+ vmovdqa [rsp + _DIGEST + 4*SZ4], e
+ vmovdqa [rsp + _DIGEST + 5*SZ4], f
+ vmovdqa [rsp + _DIGEST + 6*SZ4], g
+ vmovdqa [rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPS TT2,[inp0+IDX+i*16]
+ VMOVPS TT1,[inp1+IDX+i*16]
+ VMOVPS TT4,[inp2+IDX+i*16]
+ VMOVPS TT3,[inp3+IDX+i*16]
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+ vpshufb TT2, TT2, TMP
+ vpshufb TT3, TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4
+
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddd a, a, [rsp + _DIGEST + 0*SZ4]
+ vpaddd b, b, [rsp + _DIGEST + 1*SZ4]
+ vpaddd c, c, [rsp + _DIGEST + 2*SZ4]
+ vpaddd d, d, [rsp + _DIGEST + 3*SZ4]
+ vpaddd e, e, [rsp + _DIGEST + 4*SZ4]
+ vpaddd f, f, [rsp + _DIGEST + 5*SZ4]
+ vpaddd g, g, [rsp + _DIGEST + 6*SZ4]
+ vpaddd h, h, [rsp + _DIGEST + 7*SZ4]
+
+
+ sub arg2, 1
+ jne lloop
+
+ ; write digests out
+ vmovdqa [arg1+0*SZ4],a
+ vmovdqa [arg1+1*SZ4],b
+ vmovdqa [arg1+2*SZ4],c
+ vmovdqa [arg1+3*SZ4],d
+ vmovdqa [arg1+4*SZ4],e
+ vmovdqa [arg1+5*SZ4],f
+ vmovdqa [arg1+6*SZ4],g
+ vmovdqa [arg1+7*SZ4],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [arg1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [arg1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [arg1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+ ret
+
+section .data align=64
+
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm
new file mode 100644
index 000000000..2d349abbc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm
@@ -0,0 +1,426 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA256 using SSE
+;; Logic designed/laid out by JDG
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t1}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0}
+ shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0}
+ shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0}
+ shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2}
+ shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+
+%define TABLE K256_4_MB
+%define SZ 4
+%define SZ4 4*SZ
+%define ROUNDS 64*SZ4
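+; ROUND advances by SZ4 per round, so ROUNDS (64*SZ4) marks the end of the
+; 64 rounds of one block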
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psrld %%reg, %%imm
+ pslld %%tmp, (32-(%%imm))
+ por %%reg, %%tmp
+%endmacro
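+; (SSE has no packed rotate instruction, so the rotate right is emulated with
+; two shifts and an OR; the two-operand form below uses TMP as the scratch register)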
+
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+
+
+ movdqa a0, e ; sig1: a0 = e
+ movdqa a1, e ; sig1: s1 = e
+ PRORD a0, (11-6) ; sig1: a0 = (e >> 5)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORD a1, 25 ; sig1: a1 = (e >> 25)
+ movdqa [SZ4*(%%i&0xf) + rsp],%%T1
+ paddd %%T1,[TBL + ROUND] ; T1 = W + K
+ pxor a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ paddd h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORD a2, (13-2) ; sig0: a2 = (a >> 11)
+ paddd h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORD a1, 22 ; sig0: a1 = (a >> 22)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddd h, a0
+
+ paddd d, h
+
+ pxor a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddd h, a1 ; h = h + ch + W + K + maj
+ paddd h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+
+ movdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp]
+ movdqa a1, [SZ4*((%%i-2)&0xf) + rsp]
+ movdqa a0, %%T1
+ PRORD %%T1, 18-7
+ movdqa a2, a1
+ PRORD a1, 19-17
+ pxor %%T1, a0
+ PRORD %%T1, 7
+ pxor a1, a2
+ PRORD a1, 17
+ psrld a0, 3
+ pxor %%T1, a0
+ psrld a2, 10
+ pxor a1, a2
+ paddd %%T1, [SZ4*((%%i-16)&0xf) + rsp]
+ paddd a1, [SZ4*((%%i-7)&0xf) + rsp]
+ paddd %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+%define DIGEST_SIZE 8*SZ4
+%define DATA 16*SZ4
+%define ALIGNMENT 1*8
+; ALIGNMENT makes FRAMESZ + pushes an odd multiple of 8
+%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT)
+%define _DIGEST (DATA)
+
+%define MOVPS movups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define IDX rax
+%define ROUND rbx
+%define TBL r12
+
+;; void sha256_mb_x4_sse(SHA256_MB_ARGS_X8 *args, uint64_t len);
+;; arg 1 : pointer to args (only 4 of the 8 lanes used)
+;; arg 2 : size of data in blocks (assumed >= 1)
+;;
+;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15
+;;
+
+mk_global sha256_mb_x4_sse, function, internal
+align 32
+sha256_mb_x4_sse:
+ endbranch
+ sub rsp, FRAMESZ
+
+ ;; Initialize digests
+ movdqa a,[arg1+0*SZ4]
+ movdqa b,[arg1+1*SZ4]
+ movdqa c,[arg1+2*SZ4]
+ movdqa d,[arg1+3*SZ4]
+ movdqa e,[arg1+4*SZ4]
+ movdqa f,[arg1+5*SZ4]
+ movdqa g,[arg1+6*SZ4]
+ movdqa h,[arg1+7*SZ4]
+
+ lea TBL,[TABLE]
+
+ ;; transpose input onto stack
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ movdqa [rsp + _DIGEST + 0*SZ4], a
+ movdqa [rsp + _DIGEST + 1*SZ4], b
+ movdqa [rsp + _DIGEST + 2*SZ4], c
+ movdqa [rsp + _DIGEST + 3*SZ4], d
+ movdqa [rsp + _DIGEST + 4*SZ4], e
+ movdqa [rsp + _DIGEST + 5*SZ4], f
+ movdqa [rsp + _DIGEST + 6*SZ4], g
+ movdqa [rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+ movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ MOVPS TT2,[inp0+IDX+i*16]
+ MOVPS TT1,[inp1+IDX+i*16]
+ MOVPS TT4,[inp2+IDX+i*16]
+ MOVPS TT3,[inp3+IDX+i*16]
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ pshufb TT0, TMP
+ pshufb TT1, TMP
+ pshufb TT2, TMP
+ pshufb TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4
+
+
+%assign i (i*4)
+
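+	; i is now 16: the first 16 message words are on the stack and rounds 16..63
+	; are handled by ROUND_16_XX, which schedules each new word in place
+	; (the jmp below merely skips the padding emitted by "align 16")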
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ paddd a, [rsp + _DIGEST + 0*SZ4]
+ paddd b, [rsp + _DIGEST + 1*SZ4]
+ paddd c, [rsp + _DIGEST + 2*SZ4]
+ paddd d, [rsp + _DIGEST + 3*SZ4]
+ paddd e, [rsp + _DIGEST + 4*SZ4]
+ paddd f, [rsp + _DIGEST + 5*SZ4]
+ paddd g, [rsp + _DIGEST + 6*SZ4]
+ paddd h, [rsp + _DIGEST + 7*SZ4]
+
+
+ sub arg2, 1
+ jne lloop
+
+ ; write digests out
+ movdqa [arg1+0*SZ4],a
+ movdqa [arg1+1*SZ4],b
+ movdqa [arg1+2*SZ4],c
+ movdqa [arg1+3*SZ4],d
+ movdqa [arg1+4*SZ4],e
+ movdqa [arg1+5*SZ4],f
+ movdqa [arg1+6*SZ4],g
+ movdqa [arg1+7*SZ4],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [arg1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [arg1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [arg1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+ ret
+
+section .data align=64
+
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm
new file mode 100644
index 000000000..dbd9db1b8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm
@@ -0,0 +1,620 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute oct SHA256 (8 lanes) using 256-bit AVX2
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers ymm0-15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+ %define reg3 rcx
+ %define reg4 rdx
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+ %define reg3 rsi
+ %define reg4 rdi
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND rbx
+%define TBL reg3
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 reg4
+
+; ymm0 a
+; ymm1 b
+; ymm2 c
+; ymm3 d
+; ymm4 e
+; ymm5 f
+; ymm6 g TMP0
+; ymm7 h TMP1
+; ymm8 T1 TT0
+; ymm9 TT1
+; ymm10 TT2
+; ymm11 TT3
+; ymm12 a0 TT4
+; ymm13 a1 TT5
+; ymm14 a2 TT6
+; ymm15 TMP TT7
+
+%define a ymm0
+%define b ymm1
+%define c ymm2
+%define d ymm3
+%define e ymm4
+%define f ymm5
+%define g ymm6
+%define h ymm7
+
+%define T1 ymm8
+
+%define a0 ymm12
+%define a1 ymm13
+%define a2 ymm14
+%define TMP ymm15
+
+%define TMP0 ymm6
+%define TMP1 ymm7
+
+%define TT0 ymm8
+%define TT1 ymm9
+%define TT2 ymm10
+%define TT3 ymm11
+%define TT4 ymm12
+%define TT5 ymm13
+%define TT6 ymm14
+%define TT7 ymm15
+
+%define SZ8 8*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 64*SZ8
+%define PTR_SZ 8
+%define SHA256_DIGEST_WORD_SIZE 4
+%define MAX_SHA256_LANES 8
+%define NUM_SHA256_DIGEST_WORDS 8
+%define SHA256_DIGEST_ROW_SIZE (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE)
+
+; Define stack usage
+
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESZ mod 32 must be 32-8 = 24
+struc stack_frame
+ .data resb 16*SZ8
+ .digest resb 8*SZ8
+ .ytmp resb 4*SZ8
+ .rsp resb 8
+endstruc
+%define FRAMESZ stack_frame_size
+%define _DIGEST stack_frame.digest
+%define _YTMP stack_frame.ytmp
+%define _RSP_SAVE stack_frame.rsp
+
+%define YTMP0 rsp + _YTMP + 0*SZ8
+%define YTMP1 rsp + _YTMP + 1*SZ8
+%define YTMP2 rsp + _YTMP + 2*SZ8
+%define YTMP3 rsp + _YTMP + 3*SZ8
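+; YTMP0..3 are stack spill slots: the transpose and the early rounds need more
+; ymm registers than exist, so g/h and TT4..TT7 are parked here temporarily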
+
+%define VMOVPS vmovups
+
+; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+; "transpose" data in {r0...r7} using temps {t0...t1}
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+;
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ ; process bottom half (r4..r7) {e...h}
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
+ vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
+ vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
+ vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
+ vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
+ vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
+ vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
+ vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
+%endmacro
+
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, (32-(%%imm))
+ vpsrld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, (32-(%%imm))
+ vpsrld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ8*(%%i&0xf) + rsp], %%T1
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ8 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + rsp]
+ vmovdqa a1, [SZ8*((%%i-2)&0xf) + rsp]
+ vmovdqa a0, %%T1
+ PRORD %%T1, 18-7
+ vmovdqa a2, a1
+ PRORD a1, 19-17
+ vpxor %%T1, %%T1, a0
+ PRORD %%T1, 7
+ vpxor a1, a1, a2
+ PRORD a1, 17
+ vpsrld a0, a0, 3
+ vpxor %%T1, %%T1, a0
+ vpsrld a2, a2, 10
+ vpxor a1, a1, a2
+ vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + rsp]
+ vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + rsp]
+ vpaddd %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+
+%endm
+
+
+;; void sha256_mb_x8_avx2(SHA256_ARGS *args, uint64_t size_in_blocks);
+;; arg 1 : STATE : pointer to args (digests and data pointers for the 8 lanes)
+;; arg 2 : INP_SIZE : size of input in blocks (assumed >= 1)
+mk_global sha256_mb_x8_avx2, function, internal
+align 16
+sha256_mb_x8_avx2:
+ endbranch
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+
+ ; save rsp, allocate 32-byte aligned for local variables
+ mov IDX, rsp
+ sub rsp, FRAMESZ
+ and rsp, ~31
+ mov [rsp + _RSP_SAVE], IDX
+
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqu a,[STATE + 0*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu b,[STATE + 1*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu c,[STATE + 2*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu d,[STATE + 3*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu e,[STATE + 4*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu f,[STATE + 5*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu g,[STATE + 6*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu h,[STATE + 7*SHA256_DIGEST_ROW_SIZE]
+
+ lea TBL,[K256_8_MB]
+
+ ;; load the address of each of the 4 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _args_data_ptr + 0*PTR_SZ]
+ mov inp1,[STATE + _args_data_ptr + 1*PTR_SZ]
+ mov inp2,[STATE + _args_data_ptr + 2*PTR_SZ]
+ mov inp3,[STATE + _args_data_ptr + 3*PTR_SZ]
+ mov inp4,[STATE + _args_data_ptr + 4*PTR_SZ]
+ mov inp5,[STATE + _args_data_ptr + 5*PTR_SZ]
+ mov inp6,[STATE + _args_data_ptr + 6*PTR_SZ]
+ mov inp7,[STATE + _args_data_ptr + 7*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ8], a
+ vmovdqa [rsp + _DIGEST + 1*SZ8], b
+ vmovdqa [rsp + _DIGEST + 2*SZ8], c
+ vmovdqa [rsp + _DIGEST + 3*SZ8], d
+ vmovdqa [rsp + _DIGEST + 4*SZ8], e
+ vmovdqa [rsp + _DIGEST + 5*SZ8], f
+ vmovdqa [rsp + _DIGEST + 6*SZ8], g
+ vmovdqa [rsp + _DIGEST + 7*SZ8], h
+%assign i 0
+%rep 2
+ VMOVPS TT0,[inp0+IDX+i*32]
+ VMOVPS TT1,[inp1+IDX+i*32]
+ VMOVPS TT2,[inp2+IDX+i*32]
+ VMOVPS TT3,[inp3+IDX+i*32]
+ VMOVPS TT4,[inp4+IDX+i*32]
+ VMOVPS TT5,[inp5+IDX+i*32]
+ VMOVPS TT6,[inp6+IDX+i*32]
+ VMOVPS TT7,[inp7+IDX+i*32]
+ vmovdqa [YTMP0], g
+ vmovdqa [YTMP1], h
+ TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
+ vmovdqa TMP1, [PSHUFFLE_BYTE_FLIP_MASK]
+ vmovdqa g, [YTMP0]
+ vpshufb TT0, TT0, TMP1
+ vpshufb TT1, TT1, TMP1
+ vpshufb TT2, TT2, TMP1
+ vpshufb TT3, TT3, TMP1
+ vpshufb TT4, TT4, TMP1
+ vpshufb TT5, TT5, TMP1
+ vpshufb TT6, TT6, TMP1
+ vpshufb TT7, TT7, TMP1
+ vmovdqa h, [YTMP1]
+ vmovdqa [YTMP0], TT4
+ vmovdqa [YTMP1], TT5
+ vmovdqa [YTMP2], TT6
+ vmovdqa [YTMP3], TT7
+ ROUND_00_15 TT0,(i*8+0)
+ vmovdqa TT0, [YTMP0]
+ ROUND_00_15 TT1,(i*8+1)
+ vmovdqa TT1, [YTMP1]
+ ROUND_00_15 TT2,(i*8+2)
+ vmovdqa TT2, [YTMP2]
+ ROUND_00_15 TT3,(i*8+3)
+ vmovdqa TT3, [YTMP3]
+ ROUND_00_15 TT0,(i*8+4)
+ ROUND_00_15 TT1,(i*8+5)
+ ROUND_00_15 TT2,(i*8+6)
+ ROUND_00_15 TT3,(i*8+7)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4
+
+%assign i (i*8)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddd a, a, [rsp + _DIGEST + 0*SZ8]
+ vpaddd b, b, [rsp + _DIGEST + 1*SZ8]
+ vpaddd c, c, [rsp + _DIGEST + 2*SZ8]
+ vpaddd d, d, [rsp + _DIGEST + 3*SZ8]
+ vpaddd e, e, [rsp + _DIGEST + 4*SZ8]
+ vpaddd f, f, [rsp + _DIGEST + 5*SZ8]
+ vpaddd g, g, [rsp + _DIGEST + 6*SZ8]
+ vpaddd h, h, [rsp + _DIGEST + 7*SZ8]
+
+ sub INP_SIZE, 1 ;; unit is blocks
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqu [STATE + 0*SHA256_DIGEST_ROW_SIZE],a
+ vmovdqu [STATE + 1*SHA256_DIGEST_ROW_SIZE],b
+ vmovdqu [STATE + 2*SHA256_DIGEST_ROW_SIZE],c
+ vmovdqu [STATE + 3*SHA256_DIGEST_ROW_SIZE],d
+ vmovdqu [STATE + 4*SHA256_DIGEST_ROW_SIZE],e
+ vmovdqu [STATE + 5*SHA256_DIGEST_ROW_SIZE],f
+ vmovdqu [STATE + 6*SHA256_DIGEST_ROW_SIZE],g
+ vmovdqu [STATE + 7*SHA256_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _args_data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [STATE + _args_data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [STATE + _args_data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [STATE + _args_data_ptr + 3*8], inp3
+ add inp4, IDX
+ mov [STATE + _args_data_ptr + 4*8], inp4
+ add inp5, IDX
+ mov [STATE + _args_data_ptr + 5*8], inp5
+ add inp6, IDX
+ mov [STATE + _args_data_ptr + 6*8], inp6
+ add inp7, IDX
+ mov [STATE + _args_data_ptr + 7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ mov rsp, [rsp + _RSP_SAVE]
+ ret
+
+section .data
+align 64
+K256_8_MB:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm
new file mode 100644
index 000000000..af54f7cc3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm
@@ -0,0 +1,125 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern sha256_ctx_mgr_init_sse
+extern sha256_ctx_mgr_submit_sse
+extern sha256_ctx_mgr_flush_sse
+
+extern sha256_ctx_mgr_init_avx
+extern sha256_ctx_mgr_submit_avx
+extern sha256_ctx_mgr_flush_avx
+
+extern sha256_ctx_mgr_init_avx2
+extern sha256_ctx_mgr_submit_avx2
+extern sha256_ctx_mgr_flush_avx2
+
+extern sha256_ctx_mgr_init_base
+extern sha256_ctx_mgr_submit_base
+extern sha256_ctx_mgr_flush_base
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern sha256_ctx_mgr_init_avx512
+ extern sha256_ctx_mgr_submit_avx512
+ extern sha256_ctx_mgr_flush_avx512
+%endif
+
+%ifdef HAVE_AS_KNOWS_SHANI
+ extern sha256_ctx_mgr_init_sse_ni
+ extern sha256_ctx_mgr_submit_sse_ni
+ extern sha256_ctx_mgr_flush_sse_ni
+%endif
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ %ifdef HAVE_AS_KNOWS_SHANI
+ extern sha256_ctx_mgr_init_avx512_ni
+ extern sha256_ctx_mgr_submit_avx512_ni
+ extern sha256_ctx_mgr_flush_avx512_ni
+ %endif
+%endif
+
+;;; *_mbinit is the initial value of *_dispatched; *_dispatched is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+; Initialise symbols
+mbin_interface sha256_ctx_mgr_init
+mbin_interface sha256_ctx_mgr_submit
+mbin_interface sha256_ctx_mgr_flush
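+; (the dispatch plumbing itself lives in multibinary.asm: each interface above
+; jumps through its *_dispatched pointer, and the first call runs the CPU
+; feature checks that pick one of the implementations registered below)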
+
+%ifdef HAVE_AS_KNOWS_AVX512
+	; Reuse mbin_dispatch_init6's extension, replacing the base version with the SSE version
+ %ifdef HAVE_AS_KNOWS_SHANI
+ mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_init, sha256_ctx_mgr_init_base, \
+ sha256_ctx_mgr_init_sse, sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, \
+ sha256_ctx_mgr_init_avx512, sha256_ctx_mgr_init_sse_ni, sha256_ctx_mgr_init_avx512_ni
+ mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_base, \
+ sha256_ctx_mgr_submit_sse, sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, \
+ sha256_ctx_mgr_submit_avx512, sha256_ctx_mgr_submit_sse_ni, sha256_ctx_mgr_submit_avx512_ni
+ mbin_dispatch_base_to_avx512_shani sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_base, \
+ sha256_ctx_mgr_flush_sse, sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, \
+ sha256_ctx_mgr_flush_avx512, sha256_ctx_mgr_flush_sse_ni, sha256_ctx_mgr_flush_avx512_ni
+ %else
+ mbin_dispatch_init6 sha256_ctx_mgr_init, sha256_ctx_mgr_init_base, \
+ sha256_ctx_mgr_init_sse, sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, \
+ sha256_ctx_mgr_init_avx512
+ mbin_dispatch_init6 sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_base, \
+ sha256_ctx_mgr_submit_sse, sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, \
+ sha256_ctx_mgr_submit_avx512
+ mbin_dispatch_init6 sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_base, \
+ sha256_ctx_mgr_flush_sse, sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, \
+ sha256_ctx_mgr_flush_avx512
+ %endif
+%else
+ %ifdef HAVE_AS_KNOWS_SHANI
+ mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_init, sha256_ctx_mgr_init_sse, \
+ sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, sha256_ctx_mgr_init_sse_ni
+ mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_sse, \
+ sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, sha256_ctx_mgr_submit_sse_ni
+ mbin_dispatch_sse_to_avx2_shani sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_sse, \
+ sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, sha256_ctx_mgr_flush_sse_ni
+ %else
+ mbin_dispatch_init sha256_ctx_mgr_init, sha256_ctx_mgr_init_sse, \
+ sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2
+ mbin_dispatch_init sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_sse, \
+ sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2
+ mbin_dispatch_init sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_sse, \
+ sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2
+ %endif
+%endif
+
+;;; func core, ver, snum
+slversion sha256_ctx_mgr_init, 00, 04, 0160
+slversion sha256_ctx_mgr_submit, 00, 04, 0161
+slversion sha256_ctx_mgr_flush, 00, 04, 0162
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm
new file mode 100644
index 000000000..25fc9ce16
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x1.asm
@@ -0,0 +1,361 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+%define MSG xmm0
+%define STATE0 xmm1
+%define STATE1 xmm2
+%define MSGTMP0 xmm3
+%define MSGTMP1 xmm4
+%define MSGTMP2 xmm5
+%define MSGTMP3 xmm6
+%define MSGTMP4 xmm7
+
+%define SHUF_MASK xmm8
+
+%define ABEF_SAVE xmm9
+%define CDGH_SAVE xmm10
+
+; arg indices start from 0 here, while mgr_flush/submit count them from 1
+%define MGR arg0
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+%define IDX r8 ; local variable -- consistent with caller
+%define DPTR r11 ; local variable -- input buffer pointer
+%define TMP r9 ; local variable -- assistant to address digest
+%define TBL rax
+;%define TMP2 r8 ; local variable -- assistant to address digest
+align 32
+
+; void sha256_ni_x1(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only the lane selected by IDX is used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : which lane to hash
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: rax, r9~r11, xmm0-xmm10
+;
+mk_global sha256_ni_x1, function, internal
+sha256_ni_x1:
+ endbranch
+	shl	NBLK, 6		; convert the block count into bytes
+ jz backto_mgr
+
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
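+	; the caller packs the lane index into the low byte of NLANX4:
+	; IDX = low 8 bits, NLANX4 = remaining bits (max lanes * 4)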
+
+ lea TMP, [MGR + 4*IDX]
+ ;; Initialize digest
+ ;; digests -> ABEF(state0), CDGH(state1)
+ pinsrd STATE0, [TMP + 0*NLANX4], 3 ; A
+ pinsrd STATE0, [TMP + 1*NLANX4], 2 ; B
+ pinsrd STATE1, [TMP + 2*NLANX4], 3 ; C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd STATE1, [TMP + 1*NLANX4], 2 ; D
+ pinsrd STATE0, [TMP + 2*NLANX4], 1 ; E
+ pinsrd STATE1, [TMP + 4*NLANX4], 1 ; G
+	lea	TMP, [TMP + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
+ pinsrd STATE0, [TMP + 2*NLANX4], 0 ; F
+ pinsrd STATE1, [TMP + 4*NLANX4], 0 ; H
+
+ movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK]
+ lea TBL, [TABLE]
+
+ ;; Load input pointers
+ mov DPTR, [MGR + _data_ptr + IDX*8]
+ ;; nblk is used to indicate data end
+ add NBLK, DPTR
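+	; NBLK now points just past the last block; the loop runs until DPTR reaches it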
+
+lloop:
+ ; /* Save hash values for addition after rounds */
+ movdqa ABEF_SAVE, STATE0
+ movdqa CDGH_SAVE, STATE1
+
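+	; each sha256rnds2 performs two rounds, taking its W+K pair from the low
+	; 64 bits of xmm0 (MSG); the pshufd with 0x0E moves the upper two dwords
+	; down so the same register feeds the next two rounds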
+ ; /* Rounds 0-3 */
+ movdqu MSG, [DPTR + 0*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP0, MSG
+ paddd MSG, [TBL + 0*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ ; /* Rounds 4-7 */
+ movdqu MSG, [DPTR + 1*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP1, MSG
+ paddd MSG, [TBL + 1*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ ; /* Rounds 8-11 */
+ movdqu MSG, [DPTR + 2*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP2, MSG
+ paddd MSG, [TBL + 2*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ ; /* Rounds 12-15 */
+ movdqu MSG, [DPTR + 3*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP3, MSG
+ paddd MSG, [TBL + 3*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ ; /* Rounds 16-19 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 4*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ ; /* Rounds 20-23 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 5*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ ; /* Rounds 24-27 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 6*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ ; /* Rounds 28-31 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 7*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ ; /* Rounds 32-35 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 8*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ ; /* Rounds 36-39 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 9*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ ; /* Rounds 40-43 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 10*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ ; /* Rounds 44-47 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 11*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ ; /* Rounds 48-51 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 12*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ ; /* Rounds 52-55 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 13*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ ; /* Rounds 56-59 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 14*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ ; /* Rounds 60-63 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 15*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ ; /* Add current hash values with previously saved */
+ paddd STATE0, ABEF_SAVE
+ paddd STATE1, CDGH_SAVE
+
+ ; Increment data pointer and loop if more to process
+ add DPTR, 64
+ cmp DPTR, NBLK
+ jne lloop
+
+ ; write out digests
+ lea TMP, [MGR + 4*IDX]
+ ;; ABEF(state0), CDGH(state1) -> digests
+ pextrd [TMP + 0*NLANX4], STATE0, 3 ; A
+ pextrd [TMP + 1*NLANX4], STATE0, 2 ; B
+ pextrd [TMP + 2*NLANX4], STATE1, 3 ; C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMP + 1*NLANX4], STATE1, 2 ; D
+ pextrd [TMP + 2*NLANX4], STATE0, 1 ; E
+ pextrd [TMP + 4*NLANX4], STATE1, 1 ; G
+	lea	TMP, [TMP + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
+ pextrd [TMP + 2*NLANX4], STATE0, 0 ; F
+ pextrd [TMP + 4*NLANX4], STATE1, 0 ; H
+
+ ; update input pointers
+ mov [MGR + _data_ptr + IDX*8], DPTR
+
+backto_mgr:
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ ret
+
+
+section .data align=16
+PSHUFFLE_SHANI_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+TABLE: dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_ni_x1
+no_sha256_ni_x1:
+%endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm
new file mode 100644
index 000000000..74cfc93b6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ni_x2.asm
@@ -0,0 +1,574 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_SHANI
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define FRAMESZ	64	; stack space for saving STATE0/STATE1 of both lanes (4 x 16 bytes)
+%define RSPSAVE rax
+
+%define MSG xmm0
+%define STATE0 xmm1
+%define STATE1 xmm2
+%define MSGTMP0 xmm3
+%define MSGTMP1 xmm4
+%define MSGTMP2 xmm5
+%define MSGTMP3 xmm6
+%define MSGTMP4 xmm7
+
+%define STATE0b xmm8
+%define STATE1b xmm9
+%define MSGTMP0b xmm10
+%define MSGTMP1b xmm11
+%define MSGTMP2b xmm12
+%define MSGTMP3b xmm13
+%define MSGTMP4b xmm14
+
+%define SHUF_MASK xmm15
+
+; arg indices start from 0 here, while mgr_flush/submit count them from 1
+%define MGR arg0
+%define NBLK arg1
+%define NLANX4 r10 ; consistent with caller
+%define IDX r8 ; local variable -- consistent with caller
+%define DPTR r11 ; local variable -- input buffer pointer
+%define DPTRb r12
+%define TMP r9 ; local variable -- assistant to address digest
+%define TBL r13
+%define TMPb r14 ; local variable -- assistant to address digest
+align 32
+
+; void sha256_ni_x2(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only lanes 0 and 1 are used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : which lane to hash
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: rax, r9~r14, xmm0-xmm15
+;
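+; The two lanes are processed in lockstep: their sha256rnds2 sequences are
+; interleaved so the two independent dependency chains can overlap.
+;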
+mk_global sha256_ni_x2, function, internal
+sha256_ni_x2:
+ endbranch
+ mov RSPSAVE, rsp
+ sub rsp, FRAMESZ
+	and	rsp, ~0xF	; align rsp down to a 16-byte boundary
+
+	shl	NBLK, 6		; convert the block count into bytes
+ jz backto_mgr
+
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+ lea TMP, [MGR + 4*0]
+ lea TMPb, [MGR + 4*1]
+
+ ;; Initialize digest
+ ;; digests -> ABEF(state0), CDGH(state1)
+ pinsrd STATE0, [TMP + 0*NLANX4], 3 ; A
+ pinsrd STATE0, [TMP + 1*NLANX4], 2 ; B
+ pinsrd STATE1, [TMP + 2*NLANX4], 3 ; C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd STATE1, [TMP + 1*NLANX4], 2 ; D
+ pinsrd STATE0, [TMP + 2*NLANX4], 1 ; E
+ pinsrd STATE1, [TMP + 4*NLANX4], 1 ; G
+	lea	TMP, [TMP + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
+ pinsrd STATE0, [TMP + 2*NLANX4], 0 ; F
+ pinsrd STATE1, [TMP + 4*NLANX4], 0 ; H
+
+ pinsrd STATE0b, [TMPb + 0*NLANX4], 3 ; A
+ pinsrd STATE0b, [TMPb + 1*NLANX4], 2 ; B
+ pinsrd STATE1b, [TMPb + 2*NLANX4], 3 ; C
+ lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pinsrd STATE1b, [TMPb + 1*NLANX4], 2 ; D
+ pinsrd STATE0b, [TMPb + 2*NLANX4], 1 ; E
+ pinsrd STATE1b, [TMPb + 4*NLANX4], 1 ; G
+	lea	TMPb, [TMPb + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
+ pinsrd STATE0b, [TMPb + 2*NLANX4], 0 ; F
+ pinsrd STATE1b, [TMPb + 4*NLANX4], 0 ; H
+
+ movdqa SHUF_MASK, [PSHUFFLE_SHANI_MASK]
+ lea TBL, [TABLE]
+
+ ;; Load input pointers
+ mov DPTR, [MGR + _data_ptr + 8*0]
+ mov DPTRb,[MGR + _data_ptr + 8*1]
+ ;; nblk is used to indicate data end
+ add NBLK, DPTR
+
+lloop:
+ ; /* Save hash values for addition after rounds */
+ movdqa [rsp + 0*16], STATE0
+ movdqa [rsp + 1*16], STATE1
+
+ movdqa [rsp + 2*16], STATE0b
+ movdqa [rsp + 3*16], STATE1b
+
+ ; /* Rounds 0-3 */
+ movdqu MSG, [DPTR + 0*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP0, MSG
+ paddd MSG, [TBL + 0*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ movdqu MSG, [DPTRb + 0*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP0b, MSG
+ paddd MSG, [TBL + 0*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+
+ ; /* Rounds 4-7 */
+ movdqu MSG, [DPTR + 1*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP1, MSG
+ paddd MSG, [TBL + 1*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ movdqu MSG, [DPTRb + 1*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP1b, MSG
+ paddd MSG, [TBL + 1*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP0b, MSGTMP1b
+
+ ; /* Rounds 8-11 */
+ movdqu MSG, [DPTR + 2*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP2, MSG
+ paddd MSG, [TBL + 2*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ movdqu MSG, [DPTRb + 2*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP2b, MSG
+ paddd MSG, [TBL + 2*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP1b, MSGTMP2b
+
+ ; /* Rounds 12-15 */
+ movdqu MSG, [DPTR + 3*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP3, MSG
+ paddd MSG, [TBL + 3*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ movdqu MSG, [DPTRb + 3*16]
+ pshufb MSG, SHUF_MASK
+ movdqa MSGTMP3b, MSG
+ paddd MSG, [TBL + 3*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP3b
+ palignr MSGTMP4b, MSGTMP2b, 4
+ paddd MSGTMP0b, MSGTMP4b
+ sha256msg2 MSGTMP0b, MSGTMP3b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP2b, MSGTMP3b
+
+ ; /* Rounds 16-19 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 4*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ movdqa MSG, MSGTMP0b
+ paddd MSG, [TBL + 4*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP0b
+ palignr MSGTMP4b, MSGTMP3b, 4
+ paddd MSGTMP1b, MSGTMP4b
+ sha256msg2 MSGTMP1b, MSGTMP0b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP3b, MSGTMP0b
+
+ ; /* Rounds 20-23 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 5*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ movdqa MSG, MSGTMP1b
+ paddd MSG, [TBL + 5*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP1b
+ palignr MSGTMP4b, MSGTMP0b, 4
+ paddd MSGTMP2b, MSGTMP4b
+ sha256msg2 MSGTMP2b, MSGTMP1b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP0b, MSGTMP1b
+
+ ; /* Rounds 24-27 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 6*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ movdqa MSG, MSGTMP2b
+ paddd MSG, [TBL + 6*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP2b
+ palignr MSGTMP4b, MSGTMP1b, 4
+ paddd MSGTMP3b, MSGTMP4b
+ sha256msg2 MSGTMP3b, MSGTMP2b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP1b, MSGTMP2b
+
+ ; /* Rounds 28-31 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 7*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ movdqa MSG, MSGTMP3b
+ paddd MSG, [TBL + 7*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP3b
+ palignr MSGTMP4b, MSGTMP2b, 4
+ paddd MSGTMP0b, MSGTMP4b
+ sha256msg2 MSGTMP0b, MSGTMP3b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP2b, MSGTMP3b
+
+ ; /* Rounds 32-35 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 8*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ movdqa MSG, MSGTMP0b
+ paddd MSG, [TBL + 8*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP0b
+ palignr MSGTMP4b, MSGTMP3b, 4
+ paddd MSGTMP1b, MSGTMP4b
+ sha256msg2 MSGTMP1b, MSGTMP0b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP3b, MSGTMP0b
+
+ ; /* Rounds 36-39 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 9*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP0, MSGTMP1
+
+ movdqa MSG, MSGTMP1b
+ paddd MSG, [TBL + 9*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP1b
+ palignr MSGTMP4b, MSGTMP0b, 4
+ paddd MSGTMP2b, MSGTMP4b
+ sha256msg2 MSGTMP2b, MSGTMP1b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP0b, MSGTMP1b
+
+ ; /* Rounds 40-43 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 10*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP1, MSGTMP2
+
+ movdqa MSG, MSGTMP2b
+ paddd MSG, [TBL + 10*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP2b
+ palignr MSGTMP4b, MSGTMP1b, 4
+ paddd MSGTMP3b, MSGTMP4b
+ sha256msg2 MSGTMP3b, MSGTMP2b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP1b, MSGTMP2b
+
+ ; /* Rounds 44-47 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 11*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP3
+ palignr MSGTMP4, MSGTMP2, 4
+ paddd MSGTMP0, MSGTMP4
+ sha256msg2 MSGTMP0, MSGTMP3
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP2, MSGTMP3
+
+ movdqa MSG, MSGTMP3b
+ paddd MSG, [TBL + 11*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP3b
+ palignr MSGTMP4b, MSGTMP2b, 4
+ paddd MSGTMP0b, MSGTMP4b
+ sha256msg2 MSGTMP0b, MSGTMP3b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP2b, MSGTMP3b
+
+ ; /* Rounds 48-51 */
+ movdqa MSG, MSGTMP0
+ paddd MSG, [TBL + 12*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP0
+ palignr MSGTMP4, MSGTMP3, 4
+ paddd MSGTMP1, MSGTMP4
+ sha256msg2 MSGTMP1, MSGTMP0
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+ sha256msg1 MSGTMP3, MSGTMP0
+
+ movdqa MSG, MSGTMP0b
+ paddd MSG, [TBL + 12*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP0b
+ palignr MSGTMP4b, MSGTMP3b, 4
+ paddd MSGTMP1b, MSGTMP4b
+ sha256msg2 MSGTMP1b, MSGTMP0b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+ sha256msg1 MSGTMP3b, MSGTMP0b
+
+ ; /* Rounds 52-55 */
+ movdqa MSG, MSGTMP1
+ paddd MSG, [TBL + 13*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP1
+ palignr MSGTMP4, MSGTMP0, 4
+ paddd MSGTMP2, MSGTMP4
+ sha256msg2 MSGTMP2, MSGTMP1
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ movdqa MSG, MSGTMP1b
+ paddd MSG, [TBL + 13*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP1b
+ palignr MSGTMP4b, MSGTMP0b, 4
+ paddd MSGTMP2b, MSGTMP4b
+ sha256msg2 MSGTMP2b, MSGTMP1b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+
+ ; /* Rounds 56-59 */
+ movdqa MSG, MSGTMP2
+ paddd MSG, [TBL + 14*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ movdqa MSGTMP4, MSGTMP2
+ palignr MSGTMP4, MSGTMP1, 4
+ paddd MSGTMP3, MSGTMP4
+ sha256msg2 MSGTMP3, MSGTMP2
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ movdqa MSG, MSGTMP2b
+ paddd MSG, [TBL + 14*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ movdqa MSGTMP4b, MSGTMP2b
+ palignr MSGTMP4b, MSGTMP1b, 4
+ paddd MSGTMP3b, MSGTMP4b
+ sha256msg2 MSGTMP3b, MSGTMP2b
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+
+ ; /* Rounds 60-63 */
+ movdqa MSG, MSGTMP3
+ paddd MSG, [TBL + 15*16]
+ sha256rnds2 STATE1, STATE0, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0, STATE1, MSG
+
+ movdqa MSG, MSGTMP3b
+ paddd MSG, [TBL + 15*16]
+ sha256rnds2 STATE1b, STATE0b, MSG
+ pshufd MSG, MSG, 0x0E
+ sha256rnds2 STATE0b, STATE1b, MSG
+
+ ; /* Add current hash values with previously saved */
+ paddd STATE0, [rsp + 0*16]
+ paddd STATE1, [rsp + 1*16]
+
+ paddd STATE0b, [rsp + 2*16]
+ paddd STATE1b, [rsp + 3*16]
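+	; (Merkle-Damgard feed-forward: H_i = compress(H_{i-1}, M_i) + H_{i-1})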
+
+ ; Increment data pointer and loop if more to process
+ add DPTR, 64
+ add DPTRb, 64
+ cmp DPTR, NBLK
+ jne lloop
+
+ ; write out digests
+ lea TMP, [MGR + 4*0]
+ ;; ABEF(state0), CDGH(state1) -> digests
+ pextrd [TMP + 0*NLANX4], STATE0, 3 ; A
+ pextrd [TMP + 1*NLANX4], STATE0, 2 ; B
+ pextrd [TMP + 2*NLANX4], STATE1, 3 ; C
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMP + 1*NLANX4], STATE1, 2 ; D
+ pextrd [TMP + 2*NLANX4], STATE0, 1 ; E
+ pextrd [TMP + 4*NLANX4], STATE1, 1 ; G
+	lea	TMP, [TMP + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
+ pextrd [TMP + 2*NLANX4], STATE0, 0 ; F
+ pextrd [TMP + 4*NLANX4], STATE1, 0 ; H
+
+ lea TMPb, [MGR + 4*1]
+ ;; ABEF(state0), CDGH(state1) -> digests
+ pextrd [TMPb + 0*NLANX4], STATE0b, 3 ; A
+ pextrd [TMPb + 1*NLANX4], STATE0b, 2 ; B
+ pextrd [TMPb + 2*NLANX4], STATE1b, 3 ; C
+ lea TMPb, [TMPb + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ pextrd [TMPb + 1*NLANX4], STATE1b, 2 ; D
+ pextrd [TMPb + 2*NLANX4], STATE0b, 1 ; E
+ pextrd [TMPb + 4*NLANX4], STATE1b, 1 ; G
+	lea	TMPb, [TMPb + 1*NLANX4]	; MGR + 4*IDX + 3*NLANX4
+ pextrd [TMPb + 2*NLANX4], STATE0b, 0 ; F
+ pextrd [TMPb + 4*NLANX4], STATE1b, 0 ; H
+
+ ; update input pointers
+ mov [MGR + _data_ptr + 0*8], DPTR
+ mov [MGR + _data_ptr + 1*8], DPTRb
+
+backto_mgr:
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ mov rsp, RSPSAVE
+
+ ret
+
+section .data align=16
+PSHUFFLE_SHANI_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
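+; SHA-256 round constants K[0..63] (FIPS 180-4), one 16-byte group per 4 rounds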
+TABLE: dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_ni_x2
+no_sha256_ni_x2:
+%endif
+%endif ; HAVE_AS_KNOWS_SHANI
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm
new file mode 100644
index 000000000..fc13ec279
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_opt_x1.asm
@@ -0,0 +1,567 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Implement fast SHA-256 with SSSE3 instructions. (x86_64)
+;
+; Copyright (C) 2013 Intel Corporation.
+;
+; Authors:
+; James Guilford <james.guilford@intel.com>
+; Kirk Yap <kirk.s.yap@intel.com>
+; Tim Chen <tim.c.chen@linux.intel.com>
+; Transcoded by:
+; Xiaodong Liu <xiaodong.liu@intel.com>
+;
+; This software is available to you under the OpenIB.org BSD license
+; below:
+;
+; Redistribution and use in source and binary forms, with or
+; without modification, are permitted provided that the following
+; conditions are met:
+;
+; - Redistributions of source code must retain the above
+; copyright notice, this list of conditions and the following
+; disclaimer.
+;
+; - Redistributions in binary form must reproduce the above
+; copyright notice, this list of conditions and the following
+; disclaimer in the documentation and/or other materials
+; provided with the distribution.
+;
+; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+; BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+; ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+; CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+; SOFTWARE.
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; This code is described in an Intel White-Paper:
+; "Fast SHA-256 Implementations on Intel Architecture Processors"
+;
+; To find it, surf to http://www.intel.com/p/en_US/embedded
+; and search for that title.
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+%endif
+
+%xdefine X0 xmm4
+%xdefine X1 xmm5
+%xdefine X2 xmm6
+%xdefine X3 xmm7
+
+%xdefine XTMP0 xmm0
+%xdefine XTMP1 xmm1
+%xdefine XTMP2 xmm2
+%xdefine XTMP3 xmm3
+%xdefine XTMP4 xmm8
+%xdefine XFER xmm9
+
+%define SHUF_00BA xmm10 ; shuffle xBxA -> 00BA
+%define SHUF_DC00 xmm11 ; shuffle xDxC -> DC00
+%define BYTE_FLIP_MASK xmm12
+
+; arg indexing starts from 0, while mgr_flush/submit numbers args from 1
+%define MGR arg0 ; rdi or rcx
+%define NBLK arg1 ; rsi or rdx
+%define IDX r8 ; local variable -- consistent with caller
+%define NLANX4 r10 ; consistent with caller, should be r10
+
+%define TMGR	r9	; manager pointer, kept in the stack slot _TMGR
+%define INP r9 ; data pointer stored in stack named _INP
+%define SRND r9 ; clobbers INP
+%define TMP r9 ; local variable -- assistant to address digest
+
+%xdefine TBL rbp
+%xdefine c ecx
+%xdefine d esi
+%xdefine e edx
+%xdefine a eax
+%xdefine b ebx
+
+%xdefine f edi
+%xdefine g r12d
+%xdefine h r11d
+
+%xdefine y0 r13d
+%xdefine y1 r14d
+%xdefine y2 r15d
+
+
+;; STACK_SIZE must be an odd multiple of 8 so that [rsp + _XFER] is 16-byte aligned
+%define _STACK_ALIGN_SIZE 8 ; 0 or 8 depends on pushes
+%define _INP_END_SIZE 8
+%define _INP_SIZE 8
+%define _TMGR_SIZE 8
+%define _XFER_SIZE 16
+%define _XMM_SAVE_SIZE 0
+%define _GPR_SAVE_SIZE 8*9 ;rbx, rdx, rbp, (rdi, rsi), r12~r15
+
+%define _STACK_ALIGN 0
+%define _INP_END (_STACK_ALIGN + _STACK_ALIGN_SIZE)
+%define _INP (_INP_END + _INP_END_SIZE)
+%define _TMGR (_INP + _INP_SIZE)
+%define _XFER (_TMGR + _TMGR_SIZE)
+%define _XMM_SAVE (_XFER + _XFER_SIZE)
+%define _GPR_SAVE (_XMM_SAVE + _XMM_SAVE_SIZE)
+%define STACK_SIZE (_GPR_SAVE + _GPR_SAVE_SIZE)
+
+;; assume buffers not aligned
+%define MOVDQ movdqu
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros
+
+; addm [mem], reg
+; Add reg to mem using reg-mem add and store
+%macro addm 2
+ add %2, %1 ;changed
+ mov %1, %2 ;changed
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
+; Load xmm with mem and byte swap each dword
+%macro COPY_XMM_AND_BSWAP 3
+ MOVDQ %1, %2 ;changed
+ pshufb %1, %3 ;changed
+%endmacro
+
+; rotate_Xs
+; Rotate values of symbols X0...X3
+%macro rotate_Xs 0
+%xdefine X_ X0
+%xdefine X0 X1
+%xdefine X1 X2
+%xdefine X2 X3
+%xdefine X3 X_
+%endmacro
+
+; ROTATE_ARGS
+; Rotate values of symbols a...h
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endmacro
+
+%macro FOUR_ROUNDS_AND_SCHED 0
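+	;; Does 4 SHA-256 rounds while scheduling the next 4 message words
+	;; W[i] = s1(W[i-2]) + W[i-7] + s0(W[i-15]) + W[i-16]; the vector schedule
+	;; work is interleaved with the scalar round logic to hide latency.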
+ ;; compute s0 four at a time and s1 two at a time
+ ;; compute W[-16] + W[-7] 4 at a time
+ movdqa XTMP0, X3
+ mov y0, e ; y0 = e
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ palignr XTMP0, X2, 4 ; XTMP0 = W[-7]
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ movdqa XTMP1, X1
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ xor y2, g ; y2 = f^g
+ paddd XTMP0, X0 ; XTMP0 = W[-7] + W[-16]
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ ;; compute s0
+ palignr XTMP1, X0, 4 ; XTMP1 = W[-15]
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ movdqa XTMP2, XTMP1 ; XTMP2 = W[-15]
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2 , [rsp + _XFER] ; y2 = k + w + S1 + CH
+ movdqa XTMP3, XTMP1 ; XTMP3 = W[-15]
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pslld XTMP1, (32-7) ;
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ psrld XTMP2, 7 ;
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ por XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ movdqa XTMP2, XTMP3 ; XTMP2 = W[-15]
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ movdqa XTMP4, XTMP3 ; XTMP4 = W[-15]
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ pslld XTMP3, (32-18) ;
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+ psrld XTMP2, 18 ;
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ pxor XTMP1, XTMP3
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ psrld XTMP4, 3 ; XTMP4 = W[-15] >> 3
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + (1*4 + _XFER)] ; y2 = k + w + S1 + CH
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ pxor XTMP1, XTMP2 ; XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pxor XTMP1, XTMP4 ; XTMP1 = s0
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ ;; compute low s1
+ pshufd XTMP2, X3, 11111010B ; XTMP2 = W[-2] {BBAA}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ paddd XTMP0, XTMP1 ; XTMP0 = W[-16] + W[-7] + s0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {BBAA}
+ mov y0, e ; y0 = e
+ mov y1, a ; y1 = a
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ movdqa XTMP4, XTMP2 ; XTMP4 = W[-2] {BBAA}
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xBxA}
+ xor y2, g ; y2 = f^g
+ psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xBxA}
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ psrld XTMP4, 10 ; XTMP4 = W[-2] >> 10 {BBAA}
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ pxor XTMP2, XTMP3
+ add y2, y0 ; y2 = S1 + CH
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, [rsp + (2*4 + _XFER)] ; y2 = k + w + S1 + CH
+ pxor XTMP4, XTMP2 ; XTMP4 = s1 {xBxA}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pshufb XTMP4, SHUF_00BA ; XTMP4 = s1 {00BA}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ paddd XTMP0, XTMP4 ; XTMP0 = {..., ..., W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ ;; compute high s1
+ pshufd XTMP2, XTMP0, 01010000B ; XTMP2 = W[-2] {BBAA}
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ movdqa XTMP3, XTMP2 ; XTMP3 = W[-2] {DDCC}
+ mov y0, e ; y0 = e
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ movdqa X0, XTMP2 ; X0 = W[-2] {DDCC}
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ mov y2, f ; y2 = f
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ psrlq XTMP2, 17 ; XTMP2 = W[-2] ror 17 {xDxC}
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ xor y2, g ; y2 = f^g
+ psrlq XTMP3, 19 ; XTMP3 = W[-2] ror 19 {xDxC}
+	xor	y0, e		; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ and y2, e ; y2 = (f^g)&e
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ psrld X0, 10 ; X0 = W[-2] >> 10 {DDCC}
+	xor	y1, a		; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ pxor XTMP2, XTMP3 ;
+	ror	y1, 2		; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ add y2, y0 ; y2 = S1 + CH
+ add y2, [rsp + (3*4 + _XFER)] ; y2 = k + w + S1 + CH
+ pxor X0, XTMP2 ; X0 = s1 {xDxC}
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ pshufb X0, SHUF_DC00 ; X0 = s1 {DC00}
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ paddd X0, XTMP0 ; X0 = {W[3], W[2], W[1], W[0]}
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+
+ ROTATE_ARGS
+ rotate_Xs
+%endmacro
+
+;; input is [rsp + _XFER + %1 * 4]
+%macro DO_ROUND 1
+ mov y0, e ; y0 = e
+ ror y0, (25-11) ; y0 = e >> (25-11)
+ mov y1, a ; y1 = a
+ xor y0, e ; y0 = e ^ (e >> (25-11))
+ ror y1, (22-13) ; y1 = a >> (22-13)
+ mov y2, f ; y2 = f
+ xor y1, a ; y1 = a ^ (a >> (22-13)
+ ror y0, (11-6) ; y0 = (e >> (11-6)) ^ (e >> (25-6))
+ xor y2, g ; y2 = f^g
+ xor y0, e ; y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
+ ror y1, (13-2) ; y1 = (a >> (13-2)) ^ (a >> (22-2))
+ and y2, e ; y2 = (f^g)&e
+ xor y1, a ; y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
+	ror	y0, 6		; y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
+ xor y2, g ; y2 = CH = ((f^g)&e)^g
+ add y2, y0 ; y2 = S1 + CH
+ ror y1, 2 ; y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
+ %xdefine offset (%1 * 4 + _XFER)
+ add y2, [rsp + offset] ; y2 = k + w + S1 + CH
+ mov y0, a ; y0 = a
+ add h, y2 ; h = h + S1 + CH + k + w
+ mov y2, a ; y2 = a
+ or y0, c ; y0 = a|c
+ add d, h ; d = d + h + S1 + CH + k + w
+ and y2, c ; y2 = a&c
+ and y0, b ; y0 = (a|c)&b
+ add h, y1 ; h = h + S1 + CH + k + w + S0
+ or y0, y2 ; y0 = MAJ = (a|c)&b)|(a&c)
+ add h, y0 ; h = h + S1 + CH + k + w + S0 + MAJ
+ ROTATE_ARGS
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha256_opt_x1(SHA256_MB_ARGS_Xn *args, uint32_t size_in_blocks);
+; arg 0 : MGR : pointer to args (only the lane selected by IDX is used)
+; arg 1 : NBLK : size (in blocks) ;; assumed to be >= 1
+; invisible arg 2 : IDX : hash on which lane
+; invisible arg 3 : NLANX4 : max lanes*4 for this arch (digest is placed by it)
+; (sse/avx is 4, avx2 is 8, avx512 is 16)
+;
+; Clobbers registers: all general regs, xmm0-xmm12
+; {rbx, rdx, rbp, (rdi, rsi), r12~r15 are saved on stack}
+;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+section .text
+mk_global sha256_opt_x1, function, internal
+sha256_opt_x1:
+ endbranch
+ sub rsp, STACK_SIZE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ mov [rsp + _GPR_SAVE + 8*3], rsi
+ ; caller has already stored XMM6~10
+%endif
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+ mov [rsp + _GPR_SAVE + 8*8], rdx
+
+ shl NBLK, 6 ; convert to bytes
+ jz done_hash
+
+ ; detach idx from nlanx4
+ mov IDX, NLANX4
+ shr NLANX4, 8
+ and IDX, 0xff
+
+ mov [rsp + _TMGR], MGR
+ ;; Load input pointers
+ mov INP, [MGR + _data_ptr + IDX*8]
+ mov [rsp + _INP], INP
+ ;; nblk is used to indicate data end
+ add NBLK, INP
+ mov [rsp + _INP_END], NBLK ; pointer to end of data
+
+
+ mov TMGR, [rsp + _TMGR]
+ ;; load initial digest
+ lea TMP, [TMGR + 4*IDX]
+ mov a, [TMP + 0*NLANX4]
+ mov b, [TMP + 1*NLANX4]
+ mov c, [TMP + 2*NLANX4]
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ mov d, [TMP + 1*NLANX4]
+ mov e, [TMP + 2*NLANX4]
+ mov g, [TMP + 4*NLANX4]
+ lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 3*NLANX4
+ mov f, [TMP + 2*NLANX4]
+ mov h, [TMP + 4*NLANX4]
+
+ movdqa BYTE_FLIP_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
+ movdqa SHUF_00BA, [_SHUF_00BA]
+ movdqa SHUF_DC00, [_SHUF_DC00]
+
+ mov INP, [rsp + _INP]
+loop0:
+ lea TBL, [K256]
+
+ ;; byte swap first 16 dwords
+ COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X1, [INP + 1*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X2, [INP + 2*16], BYTE_FLIP_MASK
+ COPY_XMM_AND_BSWAP X3, [INP + 3*16], BYTE_FLIP_MASK
+
+ mov [rsp + _INP], INP
+
+ ;; schedule 48 input dwords, by doing 3 rounds of 16 each
+ mov SRND, 3
+
+loop1:
+ movdqa XFER, [TBL]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa XFER, [TBL + 1*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa XFER, [TBL + 2*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ FOUR_ROUNDS_AND_SCHED
+
+ movdqa XFER, [TBL + 3*16]
+ paddd XFER, X0
+ movdqa [rsp + _XFER], XFER
+ add TBL, 4*16
+ FOUR_ROUNDS_AND_SCHED
+
+ sub SRND, 1
+ jne loop1
+
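+	;; final 16 rounds (48..63): the remaining message words are already in
+	;; X0..X3, so only the round computation is left (2 iterations of 8 rounds)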
+ mov SRND, 2
+loop2:
+ paddd X0, [TBL]
+ movdqa [rsp + _XFER], X0
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+ paddd X1, [TBL + 1*16]
+ movdqa [rsp + _XFER], X1
+ add TBL, 2*16
+ DO_ROUND 0
+ DO_ROUND 1
+ DO_ROUND 2
+ DO_ROUND 3
+
+ movdqa X0, X2
+ movdqa X1, X3
+
+ sub SRND, 1
+ jne loop2
+
+ ; write out digests
+ mov TMGR, [rsp + _TMGR]
+ lea TMP, [TMGR + 4*IDX]
+ addm a, [TMP + 0*NLANX4]
+ addm b, [TMP + 1*NLANX4]
+ addm c, [TMP + 2*NLANX4]
+ lea TMP, [TMP + 2*NLANX4] ; MGR + 4*IDX + 2*NLANX4
+ addm d, [TMP + 1*NLANX4]
+ addm e, [TMP + 2*NLANX4]
+ addm g, [TMP + 4*NLANX4]
+ lea TMP, [TMP + 1*NLANX4] ; MGR + 4*IDX + 3*NLANX4
+ addm f, [TMP + 2*NLANX4]
+ addm h, [TMP + 4*NLANX4]
+
+ mov INP, [rsp + _INP]
+ add INP, 64
+ cmp INP, [rsp + _INP_END]
+ jne loop0
+
+done_hash:
+ mov MGR, [rsp + _TMGR]
+
+ mov rdx, [rsp + _GPR_SAVE + 8*8]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rsi, [rsp + _GPR_SAVE + 8*3]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbp, [rsp + _GPR_SAVE + 8*1]
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SIZE
+
+ ret
+
+section .data
+align 64
+K256:
+ DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+
+PSHUFFLE_BYTE_FLIP_MASK:
+ DQ 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+; shuffle xBxA -> 00BA
+_SHUF_00BA:
+ DQ 0x0b0a090803020100, 0xFFFFFFFFFFFFFFFF
+
+; shuffle xDxC -> DC00
+_SHUF_DC00:
+ DQ 0xFFFFFFFFFFFFFFFF, 0x0b0a090803020100
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c
new file mode 100644
index 000000000..c3515dc52
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c
@@ -0,0 +1,204 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha256_mb.h"
+#include "endian_helper.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA256 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define H0 0x6a09e667
+#define H1 0xbb67ae85
+#define H2 0x3c6ef372
+#define H3 0xa54ff53a
+#define H4 0x510e527f
+#define H5 0x9b05688c
+#define H6 0x1f83d9ab
+#define H7 0x5be0cd19
+
+#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r))))
+
+#define W(x) w[(x) & 15]
+
+#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3))
+#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10))
+
+#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22))
+#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be32(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
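+/* Note: instead of physically rotating the working variables a..h after each
+ * round, the callers below permute the arguments passed to step(), so each
+ * expansion updates the proper d and h for that round. */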
+
+static void OPT_FIX sha256_single(const uint8_t * data, uint32_t digest[]);
+
+void sha256_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA256_BLOCK_SIZE];
+
+ digest[0] = H0;
+ digest[1] = H1;
+ digest[2] = H2;
+ digest[3] = H3;
+ digest[4] = H4;
+ digest[5] = H5;
+ digest[6] = H6;
+ digest[7] = H7;
+
+ i = len;
+ while (i >= SHA256_BLOCK_SIZE) {
+ sha256_single(input_data, digest);
+ input_data += SHA256_BLOCK_SIZE;
+ i -= SHA256_BLOCK_SIZE;
+ }
+
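+	// Pad the tail: append the 0x80 marker, zero-fill, then store the 64-bit
+	// big-endian bit length at the end; one or two final blocks are hashed
+	// depending on how many bytes remain.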
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - SHA256_PADLENGTHFIELD_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA256_BLOCK_SIZE - SHA256_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA256_BLOCK_SIZE;
+ else
+ i = SHA256_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ sha256_single(buf, digest);
+ if (i == 2 * SHA256_BLOCK_SIZE)
+ sha256_single(buf + SHA256_BLOCK_SIZE, digest);
+}
+
+void sha256_single(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e, f, g, h, t1, t2;
+ uint32_t w[16];
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98);
+ step(1, h, a, b, c, d, e, f, g, 0x71374491);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+ step(31, b, c, d, e, f, g, h, a, 0x14292967);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+ step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+ step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
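+
+/*
+ * Illustrative usage sketch (assumes SHA256_DIGEST_NWORDS from sha256_mb.h;
+ * 'msg' and 'msg_len' are hypothetical caller-side names):
+ *
+ *   uint32_t digest[SHA256_DIGEST_NWORDS];
+ *   sha256_ref(msg, digest, msg_len);
+ *   // digest[0..7] now holds the eight 32-bit SHA-256 hash words
+ */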
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am
new file mode 100644
index 000000000..4ba7d1049
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am
@@ -0,0 +1,108 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += sha512_mb/sha512_ctx_sse.c \
+ sha512_mb/sha512_ctx_avx.c \
+ sha512_mb/sha512_ctx_avx2.c \
+ sha512_mb/sha512_ctx_sb_sse4.c \
+ sha512_mb/sha512_ctx_base.c
+
+lsrc_x86_64 += sha512_mb/sha512_mb_mgr_init_sse.c \
+ sha512_mb/sha512_mb_mgr_init_avx2.c \
+ sha512_mb/sha512_sb_mgr_init_sse4.c
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_x86_64 += sha512_mb/sha512_mb_mgr_submit_sse.asm \
+ sha512_mb/sha512_mb_mgr_submit_avx.asm \
+ sha512_mb/sha512_mb_mgr_submit_avx2.asm \
+ sha512_mb/sha512_mb_mgr_flush_sse.asm \
+ sha512_mb/sha512_mb_mgr_flush_avx.asm \
+ sha512_mb/sha512_mb_mgr_flush_avx2.asm \
+ sha512_mb/sha512_mb_x2_sse.asm \
+ sha512_mb/sha512_mb_x2_avx.asm \
+ sha512_mb/sha512_mb_x4_avx2.asm \
+ sha512_mb/sha512_multibinary.asm \
+ sha512_mb/sha512_sb_mgr_submit_sse4.c \
+ sha512_mb/sha512_sb_mgr_flush_sse4.c \
+ sha512_mb/sha512_sse4.asm
+
+lsrc_x86_64 += sha512_mb/sha512_ctx_avx512.c \
+ sha512_mb/sha512_mb_mgr_init_avx512.c \
+ sha512_mb/sha512_mb_mgr_submit_avx512.asm \
+ sha512_mb/sha512_mb_mgr_flush_avx512.asm \
+ sha512_mb/sha512_mb_x8_avx512.asm
+
+lsrc_x86_32 += $(lsrc_x86_64)
+
+lsrc_aarch64 += sha512_mb/sha512_ctx_base.c \
+ sha512_mb/aarch64/sha512_mb_multibinary.S \
+ sha512_mb/aarch64/sha512_mb_aarch64_dispatcher.c \
+ sha512_mb/aarch64/sha512_ctx_ce.c \
+ sha512_mb/aarch64/sha512_mb_mgr_ce.c \
+ sha512_mb/aarch64/sha512_mb_x1_ce.S \
+ sha512_mb/aarch64/sha512_mb_x2_ce.S
+
+lsrc_base_aliases += sha512_mb/sha512_ctx_base.c \
+ sha512_mb/sha512_ctx_base_aliases.c
+
+src_include += -I $(srcdir)/sha512_mb
+
+extern_hdrs += include/sha512_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ sha512_mb/sha512_job.asm \
+ sha512_mb/sha512_mb_mgr_datastruct.asm \
+ include/reg_sizes.asm \
+ sha512_mb/sha512_ref.c \
+ include/memcpy_inline.h \
+ include/memcpy.asm \
+ include/intrinreg.h
+
+check_tests += sha512_mb/sha512_mb_test \
+ sha512_mb/sha512_mb_rand_test \
+ sha512_mb/sha512_mb_rand_update_test
+
+unit_tests += sha512_mb/sha512_mb_rand_ssl_test
+
+perf_tests += sha512_mb/sha512_mb_vs_ossl_perf
+
+sha512_mb_rand_test: sha512_ref.o
+sha512_mb_sha512_mb_rand_test_LDADD = sha512_mb/sha512_ref.lo libisal_crypto.la
+
+sha512_mb_rand_update_test: sha512_ref.o
+sha512_mb_sha512_mb_rand_update_test_LDADD = sha512_mb/sha512_ref.lo libisal_crypto.la
+
+sha512_mb_rand_ssl_test: LDLIBS += -lcrypto
+sha512_mb_sha512_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sha512_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sha512_mb_sha512_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_ctx_ce.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_ctx_ce.c
new file mode 100644
index 000000000..02f04197b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_ctx_ce.c
@@ -0,0 +1,256 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+void sha512_mb_mgr_init_ce(SHA512_MB_JOB_MGR * state);
+SHA512_JOB *sha512_mb_mgr_submit_ce(SHA512_MB_JOB_MGR * state, SHA512_JOB * job);
+SHA512_JOB *sha512_mb_mgr_flush_ce(SHA512_MB_JOB_MGR * state);
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_ce(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_ce(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_ce(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA512_HASH_CTX *) sha512_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_ce(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_ce(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
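+/*
+ * Illustrative driver sketch (hash_ctx_init() and HASH_ENTIRE are assumed to
+ * come from multi_buffer.h; 'buf' and 'len' are hypothetical caller names):
+ *
+ *   SHA512_HASH_CTX_MGR mgr;
+ *   SHA512_HASH_CTX ctx;
+ *   sha512_ctx_mgr_init_ce(&mgr);
+ *   hash_ctx_init(&ctx);
+ *   sha512_ctx_mgr_submit_ce(&mgr, &ctx, buf, len, HASH_ENTIRE);
+ *   while (sha512_ctx_mgr_flush_ce(&mgr) != NULL)
+ *           ;   // drain until every queued job has completed
+ *   // ctx.job.result_digest[] then holds the SHA-512 digest words
+ */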
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_ce(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx =
+ (SHA512_HASH_CTX *) sha512_mb_mgr_submit_ce(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_ce_slver_02020142;
+struct slver sha512_ctx_mgr_init_ce_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha512_ctx_mgr_submit_ce_slver_02020143;
+struct slver sha512_ctx_mgr_submit_ce_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha512_ctx_mgr_flush_ce_slver_02020144;
+struct slver sha512_ctx_mgr_flush_ce_slver = { 0x0144, 0x02, 0x02 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_aarch64_dispatcher.c
new file mode 100644
index 000000000..321e8507d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_aarch64_dispatcher.c
@@ -0,0 +1,59 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
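+/*
+ * Runtime dispatch: each interface probes the CPU via getauxval(AT_HWCAP) and
+ * selects the Crypto-Extensions (_ce) implementation when the required HWCAP
+ * bit is present, otherwise it falls back to the portable base implementation.
+ */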
+DEFINE_INTERFACE_DISPATCHER(sha512_ctx_mgr_submit)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA3)
+ return PROVIDER_INFO(sha512_ctx_mgr_submit_ce);
+
+ return PROVIDER_BASIC(sha512_ctx_mgr_submit);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha512_ctx_mgr_init)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA3)
+ return PROVIDER_INFO(sha512_ctx_mgr_init_ce);
+
+ return PROVIDER_BASIC(sha512_ctx_mgr_init);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sha512_ctx_mgr_flush)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SHA3)
+ return PROVIDER_INFO(sha512_ctx_mgr_flush_ce);
+
+ return PROVIDER_BASIC(sha512_ctx_mgr_flush);
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_mgr_ce.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_mgr_ce.c
new file mode 100644
index 000000000..43801c3d6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_mgr_ce.c
@@ -0,0 +1,210 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sha512_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+#ifndef SHA512_MB_CE_MAX_LANES
+#define SHA512_MB_CE_MAX_LANES 2
+#endif
+
+#if SHA512_MB_CE_MAX_LANES >=2
+void sha512_mb_ce_x2(SHA512_JOB *, SHA512_JOB *, int);
+#endif
+void sha512_mb_ce_x1(SHA512_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
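+/*
+ * Each lens[] entry packs the remaining job length (in 128-byte blocks) into
+ * the upper bits and the lane index into the low nibble, i.e.
+ * lens[i] = (blocks << 4) | i, so taking the minimum over the active lanes
+ * yields both the shortest length and the lane that will finish first.
+ */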
+void sha512_mb_mgr_init_ce(SHA512_MB_JOB_MGR * state)
+{
+ int i;
+ //~ state->unused_lanes = 0xf3210;
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = SHA512_MB_CE_MAX_LANES - 1; i >= 0; i--) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	// lanes >= SHA512_MB_CE_MAX_LANES are invalid lanes
+ for (i = SHA512_MB_CE_MAX_LANES; i < SHA512_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sha512_mb_mgr_do_jobs(SHA512_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i, lanes;
+
+ int lane_idx_array[SHA512_MAX_LANES];
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+#if SHA512_MB_CE_MAX_LANES == 2
+ if (state->num_lanes_inuse == 2) {
+ len = min(state->lens[0], state->lens[1]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sha512_mb_ce_x2(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane, len >> 4);
+
+ } else
+#endif
+ {
+ lanes = 0, len = 0;
+ for (i = 0; i < SHA512_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ if (lanes)
+ len = min(len, state->lens[i]);
+ else
+ len = state->lens[i];
+ lane_idx_array[lanes] = i;
+ lanes++;
+ }
+ }
+ if (lanes == 0)
+ return -1;
+ lane_idx = len & 0xf;
+ len = len & (~0xf);
+
+#if SHA512_MB_CE_MAX_LANES >=2
+ if (lanes == 2) {
+ sha512_mb_ce_x2(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane, len >> 4);
+ } else
+#endif
+ {
+ sha512_mb_ce_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4);
+ }
+ }
+	// the min-length job is done; advance the remaining lanes and return its lane index
+ for (i = 0; i < SHA512_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 3;
+ }
+ }
+
+ return lane_idx;
+
+}
+
+static SHA512_JOB *sha512_mb_mgr_free_lane(SHA512_MB_JOB_MGR * state)
+{
+ int i;
+ SHA512_JOB *ret = NULL;
+
+ for (i = 0; i < SHA512_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sha512_mb_mgr_insert_job(SHA512_MB_JOB_MGR * state, SHA512_JOB * job)
+{
+ int lane_idx;
+ //add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+ //fatal error
+ assert(lane_idx < SHA512_MB_CE_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SHA512_JOB *sha512_mb_mgr_submit_ce(SHA512_MB_JOB_MGR * state, SHA512_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SHA512_JOB *ret;
+
+ //add job into lanes
+ sha512_mb_mgr_insert_job(state, job);
+
+ ret = sha512_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	// submit only starts processing once every lane holds a job; wait otherwise
+ if (state->num_lanes_inuse < SHA512_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sha512_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sha512_mb_mgr_do_jobs(state);
+#endif
+
+ //~ i = lane_idx;
+ ret = sha512_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SHA512_JOB *sha512_mb_mgr_flush_ce(SHA512_MB_JOB_MGR * state)
+{
+ SHA512_JOB *ret;
+ ret = sha512_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sha512_mb_mgr_do_jobs(state);
+ return sha512_mb_mgr_free_lane(state);
+
+}
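
The job manager above works at the block level: a SHA512_JOB's buffer must point at whole 128-byte blocks, its len counts blocks rather than bytes, and submit only kicks off the CE kernels once every lane holds a job, so callers have to flush to drain the last jobs. A minimal sketch of how this layer might be driven directly is shown below; the zero-initialization of the jobs and the use of result_digest are assumptions based on the fields referenced in this file and in the ctx wrappers further down, not an example taken from the library.

/* Hedged sketch: hash two pre-padded buffers through the CE job manager.
 * Assumes SHA512_JOB exposes the buffer/len/status/result_digest fields
 * used elsewhere in this patch; padding is the caller's responsibility
 * at this level. */
#include <stdint.h>
#include <sha512_mb.h>

void hash_two_buffers(uint8_t *blocks_a, uint32_t nblocks_a,
		      uint8_t *blocks_b, uint32_t nblocks_b)
{
	SHA512_MB_JOB_MGR mgr;
	SHA512_JOB job_a = { 0 }, job_b = { 0 };

	sha512_mb_mgr_init_ce(&mgr);

	job_a.buffer = blocks_a;
	job_a.len = nblocks_a;	/* length in 128-byte blocks */
	job_b.buffer = blocks_b;
	job_b.len = nblocks_b;

	/* submit may return NULL until both lanes are occupied */
	sha512_mb_mgr_submit_ce(&mgr, &job_a);
	sha512_mb_mgr_submit_ce(&mgr, &job_b);

	/* flush keeps running the kernels until no lane is busy */
	while (sha512_mb_mgr_flush_ce(&mgr) != NULL)
		;

	/* both jobs now have status == STS_COMPLETED and their chaining
	 * value updated in place (result_digest) */
}
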
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_multibinary.S b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_multibinary.S
new file mode 100644
index 000000000..58bf13478
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_multibinary.S
@@ -0,0 +1,36 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include <aarch64_multibinary.h>
+
+
+mbin_interface sha512_ctx_mgr_submit
+mbin_interface sha512_ctx_mgr_init
+mbin_interface sha512_ctx_mgr_flush
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x1_ce.S b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x1_ce.S
new file mode 100644
index 000000000..ab5d0aed7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x1_ce.S
@@ -0,0 +1,269 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a+crypto+sha3
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+macros for rounds 0-63
+*/
+.macro sha512_rounds_low ab,cd,ef,gh,tmp,msg0,msg1,msg4,msg5,msg7
+ ldr key_q , [key_adr]
+ add l0_tmp0_v.2d,l0_\msg0\()_v.2d,key_v.2d
+ add key_adr,key_adr,16
+ ext l0_tmp1_v.16b,l0_\ef\()_v.16b,l0_\gh\()_v.16b,#8
+ ext l0_tmp0_v.16b,l0_tmp0_v.16b,l0_tmp0_v.16b,#8
+ ext l0_tmp2_v.16b,l0_\cd\()_v.16b,l0_\ef\()_v.16b,#8
+ add l0_\gh\()_v.2d,l0_\gh\()_v.2d,l0_tmp0_v.2d
+ ext l0_tmp0_v.16b,l0_\msg4\()_v.16b,l0_\msg5\()_v.16b,#8
+ sha512su0 l0_\msg0\()_v.2d,l0_\msg1\()_v.2d
+ sha512h l0_\gh\()_q,l0_tmp1_q,l0_tmp2_v.2d
+ sha512su1 l0_\msg0\()_v.2d,l0_\msg7\()_v.2d,l0_tmp0_v.2d
+ add l0_\tmp\()_v.2d,l0_\cd\()_v.2d,l0_\gh\()_v.2d
+ sha512h2 l0_\gh\()_q,l0_\cd\()_q,l0_\ab\()_v.2d
+.endm
+/**
+macros for rounds 64-79
+*/
+.macro sha512_rounds_high ab,cd,ef,gh,tmp,msg0
+ ldr key_q , [key_adr]
+ add l0_tmp0_v.2d,l0_\msg0\()_v.2d,key_v.2d
+ add key_adr,key_adr,16
+ ext l0_tmp1_v.16b,l0_\ef\()_v.16b,l0_\gh\()_v.16b,#8
+ ext l0_tmp0_v.16b,l0_tmp0_v.16b,l0_tmp0_v.16b,#8
+ ext l0_tmp2_v.16b,l0_\cd\()_v.16b,l0_\ef\()_v.16b,#8
+ add l0_\gh\()_v.2d,l0_\gh\()_v.2d,l0_tmp0_v.2d
+ sha512h l0_\gh\()_q,l0_tmp1_q,l0_tmp2_v.2d
+ add l0_\tmp\()_v.2d,l0_\cd\()_v.2d,l0_\gh\()_v.2d
+ sha512h2 l0_\gh\()_q,l0_\cd\()_q,l0_\ab\()_v.2d
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,31
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_ab,0
+ declare_var_vector_reg l0_cd,1
+ declare_var_vector_reg l0_ef,2
+ declare_var_vector_reg l0_gh,3
+
+ declare_var_vector_reg l0_tmp,4
+ declare_var_vector_reg l0_ab_saved,24
+ declare_var_vector_reg l0_cd_saved,25
+ declare_var_vector_reg l0_ef_saved,26
+ declare_var_vector_reg l0_gh_saved,27
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,5
+ declare_var_vector_reg l0_tmp1,6
+ declare_var_vector_reg l0_tmp2,7
+
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+ declare_var_vector_reg l0_msg4,20
+ declare_var_vector_reg l0_msg5,21
+ declare_var_vector_reg l0_msg6,22
+ declare_var_vector_reg l0_msg7,23
+
+
+
+/*
+	void sha512_mb_ce_x1(SHA512_JOB * l0_job, int len);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ len .req w1
+ l0_data .req x2
+ key_adr .req x3
+ .global sha512_mb_ce_x1
+ .type sha512_mb_ce_x1, %function
+sha512_mb_ce_x1:
+ ldr l0_data, [l0_job]
+ // load initial digest
+ add x4,l0_job,64
+ ld1 {l0_ab_v.4s-l0_gh_v.4s},[x4]
+
+
+
+start_loop:
+ adr key_adr, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ add l0_data,l0_data,64
+ ld1 {l0_msg4_v.4s-l0_msg7_v.4s},[l0_data]
+ add l0_data,l0_data,64
+ //adjust loop parameter
+
+ sub len, len, #1
+ cmp len, 0
+
+ //save state
+ mov l0_ab_saved_v.16b,l0_ab_v.16b
+ mov l0_cd_saved_v.16b,l0_cd_v.16b
+ mov l0_ef_saved_v.16b,l0_ef_v.16b
+ mov l0_gh_saved_v.16b,l0_gh_v.16b
+
+ //rev endian
+ rev64 l0_msg0_v.16b,l0_msg0_v.16b
+ rev64 l0_msg1_v.16b,l0_msg1_v.16b
+ rev64 l0_msg2_v.16b,l0_msg2_v.16b
+ rev64 l0_msg3_v.16b,l0_msg3_v.16b
+ rev64 l0_msg4_v.16b,l0_msg4_v.16b
+ rev64 l0_msg5_v.16b,l0_msg5_v.16b
+ rev64 l0_msg6_v.16b,l0_msg6_v.16b
+ rev64 l0_msg7_v.16b,l0_msg7_v.16b
+
+
+
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg0,msg1,msg4,msg5,msg7 /* rounds 0- 1 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg1,msg2,msg5,msg6,msg0 /* rounds 2- 3 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg2,msg3,msg6,msg7,msg1 /* rounds 4- 5 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg3,msg4,msg7,msg0,msg2 /* rounds 6- 7 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg4,msg5,msg0,msg1,msg3 /* rounds 8- 9 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg5,msg6,msg1,msg2,msg4 /* rounds 10-11 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg6,msg7,msg2,msg3,msg5 /* rounds 12-13 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg7,msg0,msg3,msg4,msg6 /* rounds 14-15 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg0,msg1,msg4,msg5,msg7 /* rounds 16-17 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg1,msg2,msg5,msg6,msg0 /* rounds 18-19 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg2,msg3,msg6,msg7,msg1 /* rounds 20-21 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg3,msg4,msg7,msg0,msg2 /* rounds 22-23 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg4,msg5,msg0,msg1,msg3 /* rounds 24-25 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg5,msg6,msg1,msg2,msg4 /* rounds 26-27 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg6,msg7,msg2,msg3,msg5 /* rounds 28-29 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg7,msg0,msg3,msg4,msg6 /* rounds 30-31 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg0,msg1,msg4,msg5,msg7 /* rounds 32-33 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg1,msg2,msg5,msg6,msg0 /* rounds 34-35 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg2,msg3,msg6,msg7,msg1 /* rounds 36-37 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg3,msg4,msg7,msg0,msg2 /* rounds 38-39 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg4,msg5,msg0,msg1,msg3 /* rounds 40-41 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg5,msg6,msg1,msg2,msg4 /* rounds 42-43 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg6,msg7,msg2,msg3,msg5 /* rounds 44-45 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg7,msg0,msg3,msg4,msg6 /* rounds 46-47 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg0,msg1,msg4,msg5,msg7 /* rounds 48-49 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg1,msg2,msg5,msg6,msg0 /* rounds 50-51 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg2,msg3,msg6,msg7,msg1 /* rounds 52-53 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg3,msg4,msg7,msg0,msg2 /* rounds 54-55 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg4,msg5,msg0,msg1,msg3 /* rounds 56-57 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg5,msg6,msg1,msg2,msg4 /* rounds 58-59 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg6,msg7,msg2,msg3,msg5 /* rounds 60-61 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg7,msg0,msg3,msg4,msg6 /* rounds 62-63 */
+ sha512_rounds_high ef, gh, cd,tmp, ab,msg0 /* rounds 64-65 */
+ sha512_rounds_high tmp, ef, ab, cd, gh,msg1 /* rounds 66-67 */
+ sha512_rounds_high cd,tmp, gh, ab, ef,msg2 /* rounds 68-69 */
+ sha512_rounds_high ab, cd, ef, gh,tmp,msg3 /* rounds 70-71 */
+ sha512_rounds_high gh, ab,tmp, ef, cd,msg4 /* rounds 72-73 */
+ sha512_rounds_high ef, gh, cd,tmp, ab,msg5 /* rounds 74-75 */
+ sha512_rounds_high tmp, ef, ab, cd, gh,msg6 /* rounds 76-77 */
+ sha512_rounds_high cd,tmp, gh, ab, ef,msg7 /* rounds 78-79 */
+
+
+
+ add l0_ab_v.2d,l0_ab_v.2d,l0_ab_saved_v.2d
+ add l0_cd_v.2d,l0_cd_v.2d,l0_cd_saved_v.2d
+ add l0_ef_v.2d,l0_ef_v.2d,l0_ef_saved_v.2d
+ add l0_gh_v.2d,l0_gh_v.2d,l0_gh_saved_v.2d
+
+
+ bgt start_loop
+
+ add x4,l0_job,64
+ st1 {l0_ab_v.4s-l0_gh_v.4s},[x4]
+
+
+ ret
+
+ .size sha512_mb_ce_x1, .-sha512_mb_ce_x1
+ .section .rol0_data.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .quad 0x428a2f98d728ae22, 0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538, 0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242, 0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235, 0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+ .quad 0x983e5152ee66dfab, 0xa831c66d2db43210
+ .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725
+ .quad 0x06ca6351e003826f, 0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+ .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6, 0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218, 0xd69906245565a910
+ .quad 0xf40e35855771202a, 0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec
+ .quad 0x90befffa23631e28, 0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b
+ .quad 0xca273eceea26619c, 0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae, 0x1b710b35131c471b
+ .quad 0x28db77f523047d84, 0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
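
For readers who want to map the sha512su0/sha512su1/sha512h/sha512h2 instructions above back to the specification: the kernel is a two-rounds-per-macro implementation of the FIPS 180-4 compression function, with KEY holding the 80 round constants. The scalar sketch below shows the message-schedule recurrence that the su0/su1 instructions compute, using the same sigma definitions that appear in sha512_ctx_base.c later in this patch; it is illustrative only and not part of the source.

/* Scalar view of the SHA-512 message schedule that sha512su0/sha512su1
 * implement two lanes at a time.  ror64/S0/S1 mirror the macros in
 * sha512_ctx_base.c below. */
#include <stdint.h>

#define ror64(x, r) (((x) >> (r)) ^ ((x) << (64 - (r))))
#define S0(w) (ror64(w, 1) ^ ror64(w, 8) ^ ((w) >> 7))
#define S1(w) (ror64(w, 19) ^ ror64(w, 61) ^ ((w) >> 6))

/* Expand the 16 message words of one 128-byte block (already byte-swapped,
 * as rev64 does in the kernel) into the 80-entry schedule used by rounds 0..79. */
static void sha512_schedule(uint64_t W[80], const uint64_t block[16])
{
	int t;

	for (t = 0; t < 16; t++)
		W[t] = block[t];
	for (t = 16; t < 80; t++)
		W[t] = W[t - 16] + S0(W[t - 15]) + W[t - 7] + S1(W[t - 2]);
}
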
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x2_ce.S b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x2_ce.S
new file mode 100644
index 000000000..7864eb028
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/aarch64/sha512_mb_x2_ce.S
@@ -0,0 +1,390 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a+crypto+sha3
+ .text
+ .align 2
+ .p2align 3,,7
+
+/*
+Macros
+*/
+
+.macro declare_var_vector_reg name:req,reg:req
+ \name\()_q .req q\reg
+ \name\()_v .req v\reg
+ \name\()_s .req s\reg
+.endm
+/**
+macros for rounds 0-63
+*/
+.macro sha512_rounds_low ab,cd,ef,gh,tmp,msg0,msg1,msg4,msg5,msg7
+ ldr key_q , [key_adr]
+ add l0_tmp0_v.2d,l0_\msg0\()_v.2d,key_v.2d
+ add l1_tmp0_v.2d,l1_\msg0\()_v.2d,key_v.2d
+ add key_adr,key_adr,16
+
+
+ ext l0_tmp1_v.16b,l0_\ef\()_v.16b,l0_\gh\()_v.16b,#8
+ ext l1_tmp1_v.16b,l1_\ef\()_v.16b,l1_\gh\()_v.16b,#8
+
+
+ ext l0_tmp0_v.16b,l0_tmp0_v.16b,l0_tmp0_v.16b,#8
+ ext l1_tmp0_v.16b,l1_tmp0_v.16b,l1_tmp0_v.16b,#8
+
+
+ ext l0_tmp2_v.16b,l0_\cd\()_v.16b,l0_\ef\()_v.16b,#8
+ ext l1_tmp2_v.16b,l1_\cd\()_v.16b,l1_\ef\()_v.16b,#8
+
+
+ add l0_\gh\()_v.2d,l0_\gh\()_v.2d,l0_tmp0_v.2d
+ add l1_\gh\()_v.2d,l1_\gh\()_v.2d,l1_tmp0_v.2d
+
+
+ ext l0_tmp0_v.16b,l0_\msg4\()_v.16b,l0_\msg5\()_v.16b,#8
+ ext l1_tmp0_v.16b,l1_\msg4\()_v.16b,l1_\msg5\()_v.16b,#8
+
+ sha512su0 l0_\msg0\()_v.2d,l0_\msg1\()_v.2d
+ sha512su0 l1_\msg0\()_v.2d,l1_\msg1\()_v.2d
+
+ sha512h l0_\gh\()_q,l0_tmp1_q,l0_tmp2_v.2d
+ sha512h l1_\gh\()_q,l1_tmp1_q,l1_tmp2_v.2d
+
+ sha512su1 l0_\msg0\()_v.2d,l0_\msg7\()_v.2d,l0_tmp0_v.2d
+ sha512su1 l1_\msg0\()_v.2d,l1_\msg7\()_v.2d,l1_tmp0_v.2d
+
+ add l0_\tmp\()_v.2d,l0_\cd\()_v.2d,l0_\gh\()_v.2d
+ add l1_\tmp\()_v.2d,l1_\cd\()_v.2d,l1_\gh\()_v.2d
+
+ sha512h2 l0_\gh\()_q,l0_\cd\()_q,l0_\ab\()_v.2d
+ sha512h2 l1_\gh\()_q,l1_\cd\()_q,l1_\ab\()_v.2d
+.endm
+
+/**
+macros for rounds 64-79
+*/
+.macro sha512_rounds_high ab,cd,ef,gh,tmp,msg0
+ ldr key_q , [key_adr]
+ add l0_tmp0_v.2d,l0_\msg0\()_v.2d,key_v.2d
+ add l1_tmp0_v.2d,l1_\msg0\()_v.2d,key_v.2d
+ add key_adr,key_adr,16
+
+
+ ext l0_tmp1_v.16b,l0_\ef\()_v.16b,l0_\gh\()_v.16b,#8
+ ext l1_tmp1_v.16b,l1_\ef\()_v.16b,l1_\gh\()_v.16b,#8
+
+
+ ext l0_tmp0_v.16b,l0_tmp0_v.16b,l0_tmp0_v.16b,#8
+ ext l1_tmp0_v.16b,l1_tmp0_v.16b,l1_tmp0_v.16b,#8
+
+
+ ext l0_tmp2_v.16b,l0_\cd\()_v.16b,l0_\ef\()_v.16b,#8
+ ext l1_tmp2_v.16b,l1_\cd\()_v.16b,l1_\ef\()_v.16b,#8
+
+
+ add l0_\gh\()_v.2d,l0_\gh\()_v.2d,l0_tmp0_v.2d
+ add l1_\gh\()_v.2d,l1_\gh\()_v.2d,l1_tmp0_v.2d
+
+
+
+ sha512h l0_\gh\()_q,l0_tmp1_q,l0_tmp2_v.2d
+ sha512h l1_\gh\()_q,l1_tmp1_q,l1_tmp2_v.2d
+
+
+ add l0_\tmp\()_v.2d,l0_\cd\()_v.2d,l0_\gh\()_v.2d
+ add l1_\tmp\()_v.2d,l1_\cd\()_v.2d,l1_\gh\()_v.2d
+
+ sha512h2 l0_\gh\()_q,l0_\cd\()_q,l0_\ab\()_v.2d
+ sha512h2 l1_\gh\()_q,l1_\cd\()_q,l1_\ab\()_v.2d
+.endm
+
+
+/*
+Variable list
+*/
+
+ declare_var_vector_reg key,6
+
+
+/*
+digest variables
+*/
+ declare_var_vector_reg l0_ab,0
+ declare_var_vector_reg l0_cd,1
+ declare_var_vector_reg l0_ef,2
+ declare_var_vector_reg l0_gh,3
+ declare_var_vector_reg l0_tmp,4
+
+ declare_var_vector_reg l1_ab,8
+ declare_var_vector_reg l1_cd,9
+ declare_var_vector_reg l1_ef,10
+ declare_var_vector_reg l1_gh,11
+ declare_var_vector_reg l1_tmp,12
+
+
+ declare_var_vector_reg l0_ab_saved,16
+ declare_var_vector_reg l0_cd_saved,17
+ declare_var_vector_reg l0_ef_saved,18
+ declare_var_vector_reg l0_gh_saved,19
+ declare_var_vector_reg l1_ab_saved,24
+ declare_var_vector_reg l1_cd_saved,25
+ declare_var_vector_reg l1_ef_saved,26
+ declare_var_vector_reg l1_gh_saved,27
+/*
+Temporary variables
+*/
+ declare_var_vector_reg l0_tmp0,5
+ declare_var_vector_reg l0_tmp1,6
+ declare_var_vector_reg l0_tmp2,7
+
+ declare_var_vector_reg l1_tmp0,13
+ declare_var_vector_reg l1_tmp1,14
+ declare_var_vector_reg l1_tmp2,15
+
+
+
+/*
+Message variables
+*/
+ declare_var_vector_reg l0_msg0,16
+ declare_var_vector_reg l0_msg1,17
+ declare_var_vector_reg l0_msg2,18
+ declare_var_vector_reg l0_msg3,19
+ declare_var_vector_reg l0_msg4,20
+ declare_var_vector_reg l0_msg5,21
+ declare_var_vector_reg l0_msg6,22
+ declare_var_vector_reg l0_msg7,23
+
+ declare_var_vector_reg l1_msg0,24
+ declare_var_vector_reg l1_msg1,25
+ declare_var_vector_reg l1_msg2,26
+ declare_var_vector_reg l1_msg3,27
+ declare_var_vector_reg l1_msg4,28
+ declare_var_vector_reg l1_msg5,29
+ declare_var_vector_reg l1_msg6,30
+ declare_var_vector_reg l1_msg7,31
+
+
+
+/*
+ void sha512_mb_ce_x2(SHA512_JOB *, SHA512_JOB *, int);
+*/
+/*
+Arguments list
+*/
+ l0_job .req x0
+ l1_job .req x1
+ len .req w2
+ l0_data .req x3
+ l1_data .req x4
+ key_adr .req x5
+ l0_digest_adr .req x6
+ l1_digest_adr .req x7
+ .global sha512_mb_ce_x2
+ .type sha512_mb_ce_x2, %function
+sha512_mb_ce_x2:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+
+
+ ldr l0_data, [l0_job]
+ ldr l1_data, [l1_job]
+ // load initial digest
+ add l0_digest_adr,l0_job,64
+ add l1_digest_adr,l1_job,64
+ ld1 {l0_ab_v.4s-l0_gh_v.4s},[l0_digest_adr]
+ ld1 {l1_ab_v.4s-l1_gh_v.4s},[l1_digest_adr]
+
+
+
+start_loop:
+
+ adr key_adr, KEY
+ //load msgs
+ ld1 {l0_msg0_v.4s-l0_msg3_v.4s},[l0_data]
+ add l0_data,l0_data,64
+ ld1 {l0_msg4_v.4s-l0_msg7_v.4s},[l0_data]
+ add l0_data,l0_data,64
+
+ ld1 {l1_msg0_v.4s-l1_msg3_v.4s},[l1_data]
+ add l1_data,l1_data,64
+ ld1 {l1_msg4_v.4s-l1_msg7_v.4s},[l1_data]
+ add l1_data,l1_data,64
+
+ //adjust loop parameter
+ sub len, len, #1
+ cmp len, 0
+
+
+
+ //rev endian
+ rev64 l0_msg0_v.16b,l0_msg0_v.16b
+ rev64 l0_msg1_v.16b,l0_msg1_v.16b
+ rev64 l0_msg2_v.16b,l0_msg2_v.16b
+ rev64 l0_msg3_v.16b,l0_msg3_v.16b
+ rev64 l0_msg4_v.16b,l0_msg4_v.16b
+ rev64 l0_msg5_v.16b,l0_msg5_v.16b
+ rev64 l0_msg6_v.16b,l0_msg6_v.16b
+ rev64 l0_msg7_v.16b,l0_msg7_v.16b
+
+ rev64 l1_msg0_v.16b,l1_msg0_v.16b
+ rev64 l1_msg1_v.16b,l1_msg1_v.16b
+ rev64 l1_msg2_v.16b,l1_msg2_v.16b
+ rev64 l1_msg3_v.16b,l1_msg3_v.16b
+ rev64 l1_msg4_v.16b,l1_msg4_v.16b
+ rev64 l1_msg5_v.16b,l1_msg5_v.16b
+ rev64 l1_msg6_v.16b,l1_msg6_v.16b
+ rev64 l1_msg7_v.16b,l1_msg7_v.16b
+
+
+
+
+
+
+
+
+
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg0,msg1,msg4,msg5,msg7 /* rounds 0- 1 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg1,msg2,msg5,msg6,msg0 /* rounds 2- 3 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg2,msg3,msg6,msg7,msg1 /* rounds 4- 5 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg3,msg4,msg7,msg0,msg2 /* rounds 6- 7 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg4,msg5,msg0,msg1,msg3 /* rounds 8- 9 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg5,msg6,msg1,msg2,msg4 /* rounds 10-11 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg6,msg7,msg2,msg3,msg5 /* rounds 12-13 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg7,msg0,msg3,msg4,msg6 /* rounds 14-15 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg0,msg1,msg4,msg5,msg7 /* rounds 16-17 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg1,msg2,msg5,msg6,msg0 /* rounds 18-19 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg2,msg3,msg6,msg7,msg1 /* rounds 20-21 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg3,msg4,msg7,msg0,msg2 /* rounds 22-23 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg4,msg5,msg0,msg1,msg3 /* rounds 24-25 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg5,msg6,msg1,msg2,msg4 /* rounds 26-27 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg6,msg7,msg2,msg3,msg5 /* rounds 28-29 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg7,msg0,msg3,msg4,msg6 /* rounds 30-31 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg0,msg1,msg4,msg5,msg7 /* rounds 32-33 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg1,msg2,msg5,msg6,msg0 /* rounds 34-35 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg2,msg3,msg6,msg7,msg1 /* rounds 36-37 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg3,msg4,msg7,msg0,msg2 /* rounds 38-39 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg4,msg5,msg0,msg1,msg3 /* rounds 40-41 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg5,msg6,msg1,msg2,msg4 /* rounds 42-43 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg6,msg7,msg2,msg3,msg5 /* rounds 44-45 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg7,msg0,msg3,msg4,msg6 /* rounds 46-47 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg0,msg1,msg4,msg5,msg7 /* rounds 48-49 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg1,msg2,msg5,msg6,msg0 /* rounds 50-51 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg2,msg3,msg6,msg7,msg1 /* rounds 52-53 */
+ sha512_rounds_low ef, gh, cd,tmp, ab,msg3,msg4,msg7,msg0,msg2 /* rounds 54-55 */
+ sha512_rounds_low tmp, ef, ab, cd, gh,msg4,msg5,msg0,msg1,msg3 /* rounds 56-57 */
+ sha512_rounds_low cd,tmp, gh, ab, ef,msg5,msg6,msg1,msg2,msg4 /* rounds 58-59 */
+ sha512_rounds_low ab, cd, ef, gh,tmp,msg6,msg7,msg2,msg3,msg5 /* rounds 60-61 */
+ sha512_rounds_low gh, ab,tmp, ef, cd,msg7,msg0,msg3,msg4,msg6 /* rounds 62-63 */
+ sha512_rounds_high ef, gh, cd,tmp, ab,msg0 /* rounds 64-65 */
+ sha512_rounds_high tmp, ef, ab, cd, gh,msg1 /* rounds 66-67 */
+ sha512_rounds_high cd,tmp, gh, ab, ef,msg2 /* rounds 68-69 */
+ sha512_rounds_high ab, cd, ef, gh,tmp,msg3 /* rounds 70-71 */
+ ld1 {l0_ab_saved_v.4s-l0_gh_saved_v.4s},[l0_digest_adr]
+ ld1 {l1_ab_saved_v.4s-l1_gh_saved_v.4s},[l1_digest_adr]
+ sha512_rounds_high gh, ab,tmp, ef, cd,msg4 /* rounds 72-73 */
+ sha512_rounds_high ef, gh, cd,tmp, ab,msg5 /* rounds 74-75 */
+ sha512_rounds_high tmp, ef, ab, cd, gh,msg6 /* rounds 76-77 */
+ sha512_rounds_high cd,tmp, gh, ab, ef,msg7 /* rounds 78-79 */
+
+
+
+ add l0_ab_v.2d,l0_ab_v.2d,l0_ab_saved_v.2d
+ add l0_cd_v.2d,l0_cd_v.2d,l0_cd_saved_v.2d
+ add l0_ef_v.2d,l0_ef_v.2d,l0_ef_saved_v.2d
+ add l0_gh_v.2d,l0_gh_v.2d,l0_gh_saved_v.2d
+ st1 {l0_ab_v.2d-l0_gh_v.2d},[l0_digest_adr]
+
+ add l1_ab_v.2d,l1_ab_v.2d,l1_ab_saved_v.2d
+ add l1_cd_v.2d,l1_cd_v.2d,l1_cd_saved_v.2d
+ add l1_ef_v.2d,l1_ef_v.2d,l1_ef_saved_v.2d
+ add l1_gh_v.2d,l1_gh_v.2d,l1_gh_saved_v.2d
+ st1 {l1_ab_v.2d-l1_gh_v.2d},[l1_digest_adr]
+
+
+
+
+ bgt start_loop
+
+ add x4,l0_job,64
+
+
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+
+ ret
+
+ .size sha512_mb_ce_x2, .-sha512_mb_ce_x2
+ .section .rol0_data.cst16,"aM",@progbits,16
+ .align 4
+KEY:
+ .quad 0x428a2f98d728ae22, 0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538, 0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242, 0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235, 0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+ .quad 0x983e5152ee66dfab, 0xa831c66d2db43210
+ .quad 0xb00327c898fb213f, 0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2, 0xd5a79147930aa725
+ .quad 0x06ca6351e003826f, 0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc, 0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+ .quad 0x650a73548baf63de, 0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6, 0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364, 0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791, 0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218, 0xd69906245565a910
+ .quad 0xf40e35855771202a, 0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc, 0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72, 0x8cc702081a6439ec
+ .quad 0x90befffa23631e28, 0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915, 0xc67178f2e372532b
+ .quad 0xca273eceea26619c, 0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba, 0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae, 0x1b710b35131c471b
+ .quad 0x28db77f523047d84, 0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c
new file mode 100644
index 000000000..24d96763a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c
@@ -0,0 +1,269 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx")
+#endif
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_avx(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_avx(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_avx_slver_02020166;
+struct slver sha512_ctx_mgr_init_avx_slver = { 0x0166, 0x02, 0x02 };
+
+struct slver sha512_ctx_mgr_submit_avx_slver_02020167;
+struct slver sha512_ctx_mgr_submit_avx_slver = { 0x0167, 0x02, 0x02 };
+
+struct slver sha512_ctx_mgr_flush_avx_slver_02020168;
+struct slver sha512_ctx_mgr_flush_avx_slver = { 0x0168, 0x02, 0x02 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
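
In contrast to the raw job manager, this context-manager layer accepts arbitrary byte counts: it buffers partial blocks, appends the SHA-512 padding in hash_pad(), and drives the underlying sha512_mb_mgr_*_avx functions. A hedged usage sketch follows; hash_ctx_init() is a helper assumed from the wider multi-buffer framework headers rather than something defined in this file, so take the snippet as an illustration of the submit/flush pattern, not as canonical example code.

/* Sketch: hash several buffers concurrently through the AVX context manager.
 * hash_ctx_init() is assumed from the multi-buffer framework headers;
 * error and status handling is reduced to the essentials. */
#include <stdint.h>
#include <stdio.h>
#include "sha512_mb.h"

#define NUM_JOBS 4

void hash_buffers(const void *bufs[NUM_JOBS], uint32_t lens[NUM_JOBS])
{
	SHA512_HASH_CTX_MGR mgr;
	SHA512_HASH_CTX ctxpool[NUM_JOBS], *ctx;
	int i;

	sha512_ctx_mgr_init_avx(&mgr);

	for (i = 0; i < NUM_JOBS; i++) {
		hash_ctx_init(&ctxpool[i]);	/* assumed helper, see lead-in */
		ctx = sha512_ctx_mgr_submit_avx(&mgr, &ctxpool[i],
						bufs[i], lens[i], HASH_ENTIRE);
		if (ctx && ctx->error != HASH_CTX_ERROR_NONE)
			fprintf(stderr, "submit error %d\n", ctx->error);
	}

	/* completed contexts trickle out of submit and flush; flush until empty */
	while ((ctx = sha512_ctx_mgr_flush_avx(&mgr)) != NULL)
		;	/* ctx->job.result_digest holds the finished digest */
}
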
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c
new file mode 100644
index 000000000..9923e2097
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c
@@ -0,0 +1,269 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_avx2(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx2(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx2(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_avx2_slver_04020169;
+struct slver sha512_ctx_mgr_init_avx2_slver = { 0x0169, 0x02, 0x04 };
+
+struct slver sha512_ctx_mgr_submit_avx2_slver_04020170;
+struct slver sha512_ctx_mgr_submit_avx2_slver = { 0x0170, 0x02, 0x04 };
+
+struct slver sha512_ctx_mgr_flush_avx2_slver_04020171;
+struct slver sha512_ctx_mgr_flush_avx2_slver = { 0x0171, 0x02, 0x04 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
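
The index arithmetic in hash_pad() above encodes the usual SHA-512 padding rule: after the 0x80 marker there must still be room for the 16-byte length field, so when total_len mod 128 is at most 111 everything fits in one extra block, and for 112..127 a second block is needed. The small self-contained check below mirrors that expression with the same constants (128-byte block, 16-byte length field); it is only an illustration of the arithmetic, not library code.

/* Mirror hash_pad()'s block-count arithmetic and compare it with the
 * plain-English padding rule, for a few representative lengths. */
#include <assert.h>
#include <stdint.h>

#define BLOCK 128	/* SHA512_BLOCK_SIZE */
#define LENFIELD 16	/* SHA512_PADLENGTHFIELD_SIZE */

static uint32_t extra_blocks(uint64_t total_len)
{
	uint32_t i = (uint32_t) (total_len & (BLOCK - 1));

	/* same expression as hash_pad(): move i to the end of the 1st or
	 * 2nd extra block, then divide by the block size */
	i += ((BLOCK - 1) & (0 - (total_len + LENFIELD + 1))) + 1 + LENFIELD;
	return i / BLOCK;
}

int main(void)
{
	/* 200 % 128 = 72  <= 111 -> one extra block
	 * 120 % 128 = 120 >  111 -> two extra blocks */
	assert(extra_blocks(200) == 1);
	assert(extra_blocks(120) == 2);
	assert(extra_blocks(0) == 1);	/* empty message still needs padding */
	return 0;
}
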
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c
new file mode 100644
index 000000000..5c0757716
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c
@@ -0,0 +1,274 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_avx512(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx512(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx512(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_avx512_slver_0600016a;
+struct slver sha512_ctx_mgr_init_avx512_slver = { 0x016a, 0x00, 0x06 };
+
+struct slver sha512_ctx_mgr_submit_avx512_slver_0600016b;
+struct slver sha512_ctx_mgr_submit_avx512_slver = { 0x016b, 0x00, 0x06 };
+
+struct slver sha512_ctx_mgr_flush_avx512_slver_0600016c;
+struct slver sha512_ctx_mgr_flush_avx512_slver = { 0x016c, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base.c
new file mode 100644
index 000000000..61a8fa000
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base.c
@@ -0,0 +1,323 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+/* From FIPS 180-4: these are the same as for SHA-256, but operate on 64-bit words
+ * instead of 32-bit words.
+ */
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+
+/* The sigma functions have the same form as for SHA-256, except that
+ * - the word size is 64 bits
+ * - the rotation amounts differ
+ */
+#define ror64(x, r) (((x)>>(r)) ^ ((x)<<(64-(r))))
+
+/* Strictly speaking, s0/s1 here are the "capital sigma" functions (Σ0/Σ1) and S0/S1 the
+ * lowercase ones (σ0/σ1), i.e. the names are swapped relative to the standard; they are
+ * kept as-is for consistency with the other reference implementations.
+ */
+#define s0(a) (ror64(a,28) ^ ror64(a,34) ^ ror64(a,39))
+#define s1(e) (ror64(e,14) ^ ror64(e,18) ^ ror64(e,41))
+
+#define S0(w) (ror64(w,1) ^ ror64(w,8) ^ (w >> 7))
+#define S1(w) (ror64(w,19) ^ ror64(w,61) ^ (w >> 6))
+
+#define W(x) w[(x) & 15]
+
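+// One SHA-512 round per invocation.  Callers pass the working variables a..h in rotated
+// order on successive calls, so the variables never need to be physically shuffled.
+// W() indexes a 16-entry circular message schedule; the first 16 rounds load the
+// message words through to_be64().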
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be64(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
+
+static void sha512_init(SHA512_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static uint32_t sha512_update(SHA512_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static void sha512_final(SHA512_HASH_CTX * ctx, uint32_t remain_len);
+static void sha512_single(const void *data, uint64_t digest[]);
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+
+void sha512_ctx_mgr_init_base(SHA512_HASH_CTX_MGR * mgr)
+{
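+	// The base (single-buffer) implementation keeps no manager state, so there is
+	// nothing to initialize.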
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_base(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ uint32_t remain_len;
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) {
+ // Cannot submit a new entire job to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags == HASH_FIRST) {
+
+ sha512_init(ctx, buffer, len);
+ sha512_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_UPDATE) {
+ sha512_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_LAST) {
+ remain_len = sha512_update(ctx, buffer, len);
+ sha512_final(ctx, remain_len);
+ }
+
+ if (flags == HASH_ENTIRE) {
+ sha512_init(ctx, buffer, len);
+ remain_len = sha512_update(ctx, buffer, len);
+ sha512_final(ctx, remain_len);
+ }
+
+ return ctx;
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_base(SHA512_HASH_CTX_MGR * mgr)
+{
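+	// Nothing to flush: the base implementation hashes each submission to completion
+	// inside sha512_ctx_mgr_submit_base, so no jobs are ever left in flight.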
+ return NULL;
+}
+
+static void sha512_init(SHA512_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Mark it as processing
+ ctx->status = HASH_CTX_STS_PROCESSING;
+}
+
+static uint32_t sha512_update(SHA512_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ uint32_t remain_len = len;
+ uint64_t *digest = ctx->job.result_digest;
+
+ while (remain_len >= SHA512_BLOCK_SIZE) {
+ sha512_single(buffer, digest);
+ buffer = (void *)((uint8_t *) buffer + SHA512_BLOCK_SIZE);
+ remain_len -= SHA512_BLOCK_SIZE;
+ ctx->total_length += SHA512_BLOCK_SIZE;
+ }
+ ctx->status = HASH_CTX_STS_IDLE;
+ ctx->incoming_buffer = buffer;
+ return remain_len;
+}
+
+static void sha512_final(SHA512_HASH_CTX * ctx, uint32_t remain_len)
+{
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t i = remain_len, j;
+ uint8_t buf[2 * SHA512_BLOCK_SIZE];
+ uint64_t *digest = ctx->job.result_digest;
+
+ ctx->total_length += i;
+ memcpy(buf, buffer, i);
+ buf[i++] = 0x80;
+ for (j = i; j < (2 * SHA512_BLOCK_SIZE); j++)
+ buf[j] = 0;
+
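+	// Pad to two blocks when the 0x80 byte plus the 16-byte length field no longer
+	// fit in a single block; otherwise one padded block is enough.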
+ if (i > SHA512_BLOCK_SIZE - SHA512_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA512_BLOCK_SIZE;
+ else
+ i = SHA512_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8);
+
+ sha512_single(buf, digest);
+ if (i == 2 * SHA512_BLOCK_SIZE) {
+ sha512_single(buf + SHA512_BLOCK_SIZE, digest);
+ }
+
+ ctx->status = HASH_CTX_STS_COMPLETE;
+}
+
+static void sha512_single(const void *data, uint64_t digest[])
+{
+ /* Check these are all uint64_t */
+ uint64_t a, b, c, d, e, f, g, h, t1, t2;
+ uint64_t w[16];
+ uint64_t *ww = (uint64_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
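+	// The 80 SHA-512 round constants K[0..79] from FIPS 180-4 appear inline below.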
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98d728ae22);
+ step(1, h, a, b, c, d, e, f, g, 0x7137449123ef65cd);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcfec4d3b2f);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba58189dbbc);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25bf348b538);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1b605d019);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4af194f9b);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5da6d8118);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98a3030242);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b0145706fbe);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be4ee4b28c);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3d5ffb4e2);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74f27b896f);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe3b1696b1);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a725c71235);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174cf692694);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c19ef14ad2);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786384f25e3);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc68b8cd5b5);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc77ac9c65);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f592b0275);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa6ea6e483);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dcbd41fbd4);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da831153b5);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152ee66dfab);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d2db43210);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c898fb213f);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7beef0ee4);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf33da88fc2);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147930aa725);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351e003826f);
+ step(31, b, c, d, e, f, g, h, a, 0x142929670a0e6e70);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a8546d22ffc);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b21385c26c926);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc5ac42aed);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d139d95b3df);
+ step(36, e, f, g, h, a, b, c, d, 0x650a73548baf63de);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb3c77b2a8);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e47edaee6);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c851482353b);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a14cf10364);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664bbc423001);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70d0f89791);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a30654be30);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819d6ef5218);
+ step(45, d, e, f, g, h, a, b, c, 0xd69906245565a910);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e35855771202a);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa07032bbd1b8);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116b8d2d0c8);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c085141ab53);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774cdf8eeb99);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5e19b48a8);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3c5c95a63);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4ae3418acb);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f7763e373);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3d6b2b8a3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee5defb2fc);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f43172f60);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814a1f0ab72);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc702081a6439ec);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa23631e28);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506cebde82bde9);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7b2c67915);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2e372532b); // step 63
+ step(64, a, b, c, d, e, f, g, h, 0xca273eceea26619c);
+ step(65, h, a, b, c, d, e, f, g, 0xd186b8c721c0c207);
+ step(66, g, h, a, b, c, d, e, f, 0xeada7dd6cde0eb1e);
+ step(67, f, g, h, a, b, c, d, e, 0xf57d4f7fee6ed178);
+ step(68, e, f, g, h, a, b, c, d, 0x06f067aa72176fba);
+ step(69, d, e, f, g, h, a, b, c, 0x0a637dc5a2c898a6);
+ step(70, c, d, e, f, g, h, a, b, 0x113f9804bef90dae);
+ step(71, b, c, d, e, f, g, h, a, 0x1b710b35131c471b);
+ step(72, a, b, c, d, e, f, g, h, 0x28db77f523047d84);
+ step(73, h, a, b, c, d, e, f, g, 0x32caab7b40c72493);
+ step(74, g, h, a, b, c, d, e, f, 0x3c9ebe0a15c9bebc);
+ step(75, f, g, h, a, b, c, d, e, 0x431d67c49c100d4c);
+ step(76, e, f, g, h, a, b, c, d, 0x4cc5d4becb3e42b6);
+ step(77, d, e, f, g, h, a, b, c, 0x597f299cfc657e2a);
+ step(78, c, d, e, f, g, h, a, b, 0x5fcb6fab3ad6faec);
+ step(79, b, c, d, e, f, g, h, a, 0x6c44198c4a475817); // step 79
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_base_slver_000002f3;
+struct slver sha512_ctx_mgr_init_base_slver = { 0x02f3, 0x00, 0x00 };
+
+struct slver sha512_ctx_mgr_submit_base_slver_000002f4;
+struct slver sha512_ctx_mgr_submit_base_slver = { 0x02f4, 0x00, 0x00 };
+
+struct slver sha512_ctx_mgr_flush_base_slver_000002f5;
+struct slver sha512_ctx_mgr_flush_base_slver = { 0x02f5, 0x00, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base_aliases.c
new file mode 100644
index 000000000..9890c2c47
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_base_aliases.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+
+extern void sha512_ctx_mgr_init_base(SHA512_HASH_CTX_MGR * mgr);
+extern SHA512_HASH_CTX *sha512_ctx_mgr_submit_base(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx, const void *buffer,
+ uint32_t len, HASH_CTX_FLAG flags);
+extern SHA512_HASH_CTX *sha512_ctx_mgr_flush_base(SHA512_HASH_CTX_MGR * mgr);
+
+void sha512_ctx_mgr_init(SHA512_HASH_CTX_MGR * mgr)
+{
+ return sha512_ctx_mgr_init_base(mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ return sha512_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush(SHA512_HASH_CTX_MGR * mgr)
+{
+ return sha512_ctx_mgr_flush_base(mgr);
+}
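+
+/*
+ * Example usage (illustrative sketch only): initialize the manager once, submit one
+ * or more contexts, then flush until no jobs remain.  hash_ctx_init() is assumed to
+ * be the context-initialization helper from multi_buffer.h.
+ *
+ *     SHA512_HASH_CTX_MGR mgr;
+ *     SHA512_HASH_CTX ctx;
+ *
+ *     sha512_ctx_mgr_init(&mgr);
+ *     hash_ctx_init(&ctx);
+ *     sha512_ctx_mgr_submit(&mgr, &ctx, buf, len, HASH_ENTIRE);
+ *     while (sha512_ctx_mgr_flush(&mgr) != NULL)
+ *             ;
+ *     // The digest is now in ctx.job.result_digest[0..7].
+ */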
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c
new file mode 100644
index 000000000..94c32d260
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c
@@ -0,0 +1,255 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_sb_sse4(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_sb_mgr_init_sse4(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_sb_sse4(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx, const void *buffer,
+ uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_sb_sse4(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_flush_sse4(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_sb_sse4_slver_05020172;
+struct slver sha512_ctx_mgr_init_sb_sse4_slver = { 0x0172, 0x02, 0x05 };
+
+struct slver sha512_ctx_mgr_submit_sb_sse4_slver_05020173;
+struct slver sha512_ctx_mgr_submit_sb_sse4_slver = { 0x0173, 0x02, 0x05 };
+
+struct slver sha512_ctx_mgr_flush_sb_sse4_slver_05020174;
+struct slver sha512_ctx_mgr_flush_sb_sse4_slver = { 0x0174, 0x02, 0x05 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c
new file mode 100644
index 000000000..b73619875
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c
@@ -0,0 +1,255 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_sse(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_sse(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_sse(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SHA512_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_sse_slver_00020163;
+struct slver sha512_ctx_mgr_init_sse_slver = { 0x0163, 0x02, 0x00 };
+
+struct slver sha512_ctx_mgr_submit_sse_slver_00020164;
+struct slver sha512_ctx_mgr_submit_sse_slver = { 0x0164, 0x02, 0x00 };
+
+struct slver sha512_ctx_mgr_flush_sse_slver_00020165;
+struct slver sha512_ctx_mgr_flush_sse_slver = { 0x0165, 0x02, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm
new file mode 100644
index 000000000..4423cdcb5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm
@@ -0,0 +1,54 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA512_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; JOB_SHA512
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD _len, 8, 8 ; length in bytes
+FIELD _result_digest, 8*8, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+
+%assign _SHA512_JOB_size _FIELD_OFFSET
+%assign _SHA512_JOB_align _STRUCT_ALIGN
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..f54135da3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm
@@ -0,0 +1,72 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA512 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA512_ARGS_X8
+;;; name size align
+FIELD _digest, 8*8*8, 4 ; transposed digest
+FIELD _data_ptr, 8*8, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _SHA512_ARGS_X4_size _FIELD_OFFSET
+%assign _SHA512_ARGS_X4_align _STRUCT_ALIGN
+%assign _SHA512_ARGS_X8_size _FIELD_OFFSET
+%assign _SHA512_ARGS_X8_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _SHA512_ARGS_X4_size, _SHA512_ARGS_X4_align
+FIELD _lens, 8*8, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
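+
+; _args_digest and _args_data_ptr are the byte offsets of the transposed digest array
+; and the per-lane data pointers within the MB_MGR structure (i.e. inside its embedded
+; SHA512_ARGS member).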
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..65ce43d3a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm
@@ -0,0 +1,224 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_avx
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_avx(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha512_mb_mgr_flush_avx, function
+sha512_mb_mgr_flush_avx:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 16+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 2
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
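+	; Each _lens entry packs the lane index in its low bits and the block count in its
+	; upper 32 bits, so an unsigned minimum over the packed values selects the lane with
+	; the fewest remaining blocks and carries its index along.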
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..33a24a6b9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm
@@ -0,0 +1,245 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x4_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.xmm
+%define _GPR_SAVE stack_frame.gpr
+%define STACK_SPACE stack_frame_size
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_avx2(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha512_mb_mgr_flush_avx2, function
+sha512_mb_mgr_flush_avx2:
+ endbranch
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
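+	; rsp is aligned down to a 32-byte boundary; the original rsp (saved in rax above)
+	; is stored in the frame so it can be restored on return.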
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov lens2, [state + _lens + 2*8]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov lens3, [state + _lens + 3*8]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+ mov [state + _lens + 2*8], lens2
+ mov [state + _lens + 3*8], lens3
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x4_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..795027c6b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm
@@ -0,0 +1,270 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern sha512_mb_x8_avx512
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define num_lanes_inuse r9
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*8
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.xmm
+%define _GPR_SAVE stack_frame.gpr
+%define STACK_SPACE stack_frame_size
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_avx512(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha512_mb_mgr_flush_avx512, function
+sha512_mb_mgr_flush_avx512:
+ endbranch
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqu [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqu [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqu [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqu [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqu [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqu [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqu [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqu [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqu [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqu [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 7
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+	; Find min length; each len entry in sha512_mgr is 64-bit: the high 32 bits hold the block count, the low 8 bits hold the lane idx
+ vmovdqu ymm0, [state + _lens + 0*32] ; ymm0 has {D,d,C,c,B,b,A,a}
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminuq ymm2, ymm0, ymm1 ; ymm2 has {D,i,C,i,B,i,A,i}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,i,D,i,x,i,B,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has {x,i,F,i,x,i,E,i}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,i,x,i,x,i,F,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has min value in high dword
+
+ vmovq idx, xmm2
+ mov len2, idx
+ and idx, 0xF
+	shr len2, 32 ; SHA512 block size is 1024 bits
+ jz len_is_0
+
+ vperm2i128 ymm2, ymm2, ymm2, 0 ; ymm2 has {x,x,E,i,x,x,E,i}
+ vpand ymm2, ymm2, [rel clear_low_nibble] ; ymm2 has {0,0,E,0,0,0,E,0}
+ vpshufd ymm2, ymm2, 0x44 ; ymm2 has {E,0,E,0,E,0,E,0}
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x8_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*64]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*64], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*64]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*64], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*64]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*64], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*64]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*64], 1
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqu xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqu xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqu xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqu xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqu xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqu xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqu xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqu xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqu xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=32
+
+align 32
+clear_low_nibble: ; mgr len element is 0xnnnnnnnn_0000000m: nnnnnnnn is the block count, m is the lane index
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha512_mb_mgr_flush_avx512
+no_sha512_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
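
For orientation, the scheduling trick used by the flush path above can be modelled in plain C: each 64-bit lens entry packs the remaining block count into its high 32 bits and the lane index into its low bits, so one unsigned minimum over all lanes (what the vpminuq reduction computes) identifies both the shortest outstanding job and the lane holding it. The sketch below is illustrative only and is not part of the ISA-L sources; NUM_LANES and the helper names are assumptions made for the example.

#include <stdint.h>
#include <stdio.h>

#define NUM_LANES 8     /* the AVX-512 manager runs 8 SHA-512 lanes */

/* Pack a lane's state the way the manager's lens[] array does:
 * high 32 bits = blocks still to process, low bits = lane index. */
static uint64_t pack_len(uint32_t blocks, uint32_t lane)
{
	return ((uint64_t)blocks << 32) | lane;
}

/* Scalar equivalent of the vpminuq reduction in the flush path. */
static uint64_t min_len(const uint64_t lens[NUM_LANES])
{
	uint64_t m = lens[0];
	for (int i = 1; i < NUM_LANES; i++)
		if (lens[i] < m)
			m = lens[i];
	return m;
}

int main(void)
{
	uint64_t lens[NUM_LANES];

	for (uint32_t i = 0; i < NUM_LANES; i++)
		lens[i] = pack_len(3 + 2 * i, i);       /* arbitrary block counts */
	lens[5] = pack_len(1, 5);                       /* lane 5 holds the shortest job */

	uint64_t m = min_len(lens);
	printf("shortest job: lane %u, %u blocks\n",
	       (unsigned)(m & 0xF), (unsigned)(m >> 32));
	return 0;
}
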
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..8a58bf879
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm
@@ -0,0 +1,227 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_sse
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_sse(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sha512_mb_mgr_flush_sse, function
+sha512_mb_mgr_flush_sse:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 16+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 2
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_sse
+ ; state and idx are intact
+
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movq xmm0, [state + _args_digest + 8*idx + 0*32]
+ pinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ movq xmm1, [state + _args_digest + 8*idx + 2*32]
+ pinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ movq xmm2, [state + _args_digest + 8*idx + 4*32]
+ pinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ movq xmm3, [state + _args_digest + 8*idx + 6*32]
+ pinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+ movdqa [job_rax + _result_digest + 2*16], xmm2
+ movdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..7ca997653
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c
@@ -0,0 +1,45 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+void sha512_mb_mgr_init_avx2(SHA512_MB_JOB_MGR * state)
+{
+ unsigned int j;
+
+ state->lens[0] = 0;
+ state->lens[1] = 1;
+ state->lens[2] = 2;
+ state->lens[3] = 3;
+ state->unused_lanes = 0xFF03020100;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA512_X4_LANES; j++) {
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..bca9549d9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c
@@ -0,0 +1,42 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+void sha512_mb_mgr_init_avx512(SHA512_MB_JOB_MGR * state)
+{
+ unsigned int j;
+
+ state->unused_lanes = 0x0706050403020100;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA512_MAX_LANES; j++) {
+		state->lens[j] = j;	// sha512_mb dedicates the low 32 bits of each lens entry to the lane idx
+ state->ldata[j].job_in_lane = 0;
+ }
+}
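
The 0x0706050403020100 initializer above is a byte-packed stack of free lane numbers: the submit path pops the lowest byte (movzx + shr 8) to claim a lane, and the completion path pushes it back (shl 8 + or). Below is a minimal, hypothetical C sketch of that discipline; it is not part of the ISA-L sources and the helper names are made up for the example.

#include <stdint.h>
#include <stdio.h>

/* Pop the next free lane off the byte-packed stack (submit path). */
static uint32_t pop_lane(uint64_t *unused_lanes)
{
	uint32_t lane = (uint32_t)(*unused_lanes & 0xFF);
	*unused_lanes >>= 8;
	return lane;
}

/* Push a finished lane back onto the stack (completion path). */
static void push_lane(uint64_t *unused_lanes, uint32_t lane)
{
	*unused_lanes = (*unused_lanes << 8) | lane;
}

int main(void)
{
	uint64_t unused_lanes = 0x0706050403020100ULL;  /* all 8 lanes free */

	uint32_t lane = pop_lane(&unused_lanes);        /* claims lane 0 */
	printf("claimed lane %u, stack now 0x%016llx\n",
	       lane, (unsigned long long)unused_lanes);

	push_lane(&unused_lanes, lane);                 /* returns it */
	printf("stack restored to 0x%016llx\n",
	       (unsigned long long)unused_lanes);
	return 0;
}
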
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c
new file mode 100644
index 000000000..0e9ec257f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c
@@ -0,0 +1,43 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+void sha512_mb_mgr_init_sse(SHA512_MB_JOB_MGR * state)
+{
+ unsigned int j;
+
+ state->lens[0] = 0;
+ state->lens[1] = 1;
+ state->unused_lanes = 0xFF0100;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA512_MIN_LANES; j++) {
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..1e3b1b1bd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm
@@ -0,0 +1,262 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_avx
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
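+; Note: despite the names, _XMM_SAVE points at the .gpr area and is used
+; below to spill the callee-saved GPRs; the XMM registers are stored
+; directly at [rsp + 16*n] inside the .xmm area at the top of the frame.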
+
+; SHA512_JOB* sha512_mb_mgr_submit_avx(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha512_mb_mgr_submit_avx, function
+sha512_mb_mgr_submit_avx:
+ endbranch
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqa [rsp + 16*0], xmm6
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ vmovdqa [rsp + 16*4], xmm10
+ vmovdqa [rsp + 16*5], xmm11
+ vmovdqa [rsp + 16*6], xmm12
+ vmovdqa [rsp + 16*7], xmm13
+ vmovdqa [rsp + 16*8], xmm14
+ vmovdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
+
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovdqa xmm2, [job + _result_digest + 2*16]
+ vmovdqa xmm3, [job + _result_digest + 3*16]
+ vmovq [state + _args_digest + 8*lane + 0*32], xmm0
+ vpextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1
+ vmovq [state + _args_digest + 8*lane + 2*32], xmm1
+ vpextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1
+ vmovq [state + _args_digest + 8*lane + 4*32], xmm2
+ vpextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1
+ vmovq [state + _args_digest + 8*lane + 6*32], xmm3
+ vpextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xff
+ jne return_null
+
+start_loop:
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_avx
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 16*0]
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ vmovdqa xmm10, [rsp + 16*4]
+ vmovdqa xmm11, [rsp + 16*5]
+ vmovdqa xmm12, [rsp + 16*6]
+ vmovdqa xmm13, [rsp + 16*7]
+ vmovdqa xmm14, [rsp + 16*8]
+ vmovdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..c425c5bb9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm
@@ -0,0 +1,270 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x4_avx2
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
+
+; SHA512_JOB* sha512_mb_mgr_submit_avx2(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha512_mb_mgr_submit_avx2, function
+sha512_mb_mgr_submit_avx2:
+ endbranch
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqa [rsp + 16*0], xmm6
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ vmovdqa [rsp + 16*4], xmm10
+ vmovdqa [rsp + 16*5], xmm11
+ vmovdqa [rsp + 16*6], xmm12
+ vmovdqa [rsp + 16*7], xmm13
+ vmovdqa [rsp + 16*8], xmm14
+ vmovdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
+
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovdqa xmm2, [job + _result_digest + 2*16]
+ vmovdqa xmm3, [job + _result_digest + 3*16]
+ vmovq [state + _args_digest + 8*lane + 0*32], xmm0
+ vpextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1
+ vmovq [state + _args_digest + 8*lane + 2*32], xmm1
+ vpextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1
+ vmovq [state + _args_digest + 8*lane + 4*32], xmm2
+ vpextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1
+ vmovq [state + _args_digest + 8*lane + 6*32], xmm3
+ vpextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xff
+ jne return_null
+
+start_loop:
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov lens2, [state + _lens + 2*8]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov lens3, [state + _lens + 3*8]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+ mov [state + _lens + 2*8], lens2
+ mov [state + _lens + 3*8], lens3
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x4_avx2
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 16*0]
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ vmovdqa xmm10, [rsp + 16*4]
+ vmovdqa xmm11, [rsp + 16*5]
+ vmovdqa xmm12, [rsp + 16*6]
+ vmovdqa xmm13, [rsp + 16*7]
+ vmovdqa xmm14, [rsp + 16*8]
+ vmovdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..23b1b5c27
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm
@@ -0,0 +1,280 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern sha512_mb_x8_avx512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define num_lanes_inuse r9
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*8
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
+
+; SHA512_JOB* sha512_mb_mgr_submit_avx512(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha512_mb_mgr_submit_avx512, function
+sha512_mb_mgr_submit_avx512:
+ endbranch
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+ mov [rsp + _XMM_SAVE + 8*5], r13
+ mov [rsp + _XMM_SAVE + 8*6], r14
+ mov [rsp + _XMM_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqu [rsp + 16*0], xmm6
+ vmovdqu [rsp + 16*1], xmm7
+ vmovdqu [rsp + 16*2], xmm8
+ vmovdqu [rsp + 16*3], xmm9
+ vmovdqu [rsp + 16*4], xmm10
+ vmovdqu [rsp + 16*5], xmm11
+ vmovdqu [rsp + 16*6], xmm12
+ vmovdqu [rsp + 16*7], xmm13
+ vmovdqu [rsp + 16*8], xmm14
+ vmovdqu [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
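+	; the +4 byte offset lands DWORD(len) in the high dword of the 64-bit
+	; lens entry, so the low dword keeps the lane idx written at init time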
+
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovdqa xmm2, [job + _result_digest + 2*16]
+ vmovdqa xmm3, [job + _result_digest + 3*16]
+ vmovq [state + _args_digest + 8*lane + 0*64], xmm0
+ vpextrq [state + _args_digest + 8*lane + 1*64], xmm0, 1
+ vmovq [state + _args_digest + 8*lane + 2*64], xmm1
+ vpextrq [state + _args_digest + 8*lane + 3*64], xmm1, 1
+ vmovq [state + _args_digest + 8*lane + 4*64], xmm2
+ vpextrq [state + _args_digest + 8*lane + 5*64], xmm2, 1
+ vmovq [state + _args_digest + 8*lane + 6*64], xmm3
+ vpextrq [state + _args_digest + 8*lane + 7*64], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 8
+ jne return_null
+
+start_loop:
+	; Find min length; each len entry in sha512_mgr is 64-bit: the high 32 bits hold the block count, the low 8 bits hold the lane idx
+ vmovdqu ymm0, [state + _lens + 0*32] ; ymm0 has {D,d,C,c,B,b,A,a}
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminuq ymm2, ymm0, ymm1 ; ymm2 has {D,i,C,i,B,i,A,i}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,i,D,i,x,i,B,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has {x,i,F,i,x,i,E,i}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,i,x,i,x,i,F,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has min value in high dword
+
+ vmovq idx, xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 32
+ jz len_is_0
+
+
+ vperm2i128 ymm2, ymm2, ymm2, 0 ; ymm2 has {x,x,E,i,x,x,E,i}
+ vpand ymm2, ymm2, [rel clear_low_nibble] ; ymm2 has {0,0,E,0,0,0,E,0}
+ vpshufd ymm2, ymm2, 0x44 ; ymm2 has {E,0,E,0,E,0,E,0}
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x8_avx512
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*64]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*64], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*64]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*64], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*64]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*64], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*64]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*64], 1
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + 16*0]
+ vmovdqu xmm7, [rsp + 16*1]
+ vmovdqu xmm8, [rsp + 16*2]
+ vmovdqu xmm9, [rsp + 16*3]
+ vmovdqu xmm10, [rsp + 16*4]
+ vmovdqu xmm11, [rsp + 16*5]
+ vmovdqu xmm12, [rsp + 16*6]
+ vmovdqu xmm13, [rsp + 16*7]
+ vmovdqu xmm14, [rsp + 16*8]
+ vmovdqu xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov r13, [rsp + _XMM_SAVE + 8*5]
+ mov r14, [rsp + _XMM_SAVE + 8*6]
+ mov r15, [rsp + _XMM_SAVE + 8*7]
+
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=32
+
+align 32
+clear_low_nibble: ; mgr len element is 0xnnnnnnnn_0000000m: nnnnnnnn is the block count, m is the lane index
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha512_mb_mgr_submit_avx512
+no_sha512_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..ba12d586b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm
@@ -0,0 +1,260 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_sse
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
+
+; SHA512_JOB* sha512_mb_mgr_submit_sse(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sha512_mb_mgr_submit_sse, function
+sha512_mb_mgr_submit_sse:
+ endbranch
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ movdqa [rsp + 16*0], xmm6
+ movdqa [rsp + 16*1], xmm7
+ movdqa [rsp + 16*2], xmm8
+ movdqa [rsp + 16*3], xmm9
+ movdqa [rsp + 16*4], xmm10
+ movdqa [rsp + 16*5], xmm11
+ movdqa [rsp + 16*6], xmm12
+ movdqa [rsp + 16*7], xmm13
+ movdqa [rsp + 16*8], xmm14
+ movdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ movdqa xmm1, [job + _result_digest + 1*16]
+ movdqa xmm2, [job + _result_digest + 2*16]
+ movdqa xmm3, [job + _result_digest + 3*16]
+ movq [state + _args_digest + 8*lane + 0*32], xmm0
+ pextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1
+ movq [state + _args_digest + 8*lane + 2*32], xmm1
+ pextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1
+ movq [state + _args_digest + 8*lane + 4*32], xmm2
+ pextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1
+ movq [state + _args_digest + 8*lane + 6*32], xmm3
+ pextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xff
+ jne return_null
+
+start_loop:
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_sse
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ movq xmm0, [state + _args_digest + 8*idx + 0*32]
+ pinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ movq xmm1, [state + _args_digest + 8*idx + 2*32]
+ pinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ movq xmm2, [state + _args_digest + 8*idx + 4*32]
+ pinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ movq xmm3, [state + _args_digest + 8*idx + 6*32]
+ pinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+ movdqa [job_rax + _result_digest + 2*16], xmm2
+ movdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 16*0]
+ movdqa xmm7, [rsp + 16*1]
+ movdqa xmm8, [rsp + 16*2]
+ movdqa xmm9, [rsp + 16*3]
+ movdqa xmm10, [rsp + 16*4]
+ movdqa xmm11, [rsp + 16*5]
+ movdqa xmm12, [rsp + 16*6]
+ movdqa xmm13, [rsp + 16*7]
+ movdqa xmm14, [rsp + 16*8]
+ movdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c
new file mode 100644
index 000000000..74fa0384a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c
@@ -0,0 +1,160 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha512_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][8 * SHA512_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ int ret;
+
+ printf("multibinary_sha512 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+		printf("posix_memalign failed, test aborted\n");
+ return 1;
+ }
+
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ SHA512(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be64(((uint64_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %016lX <=> %016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be64(((uint64_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ SHA512(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be64(((uint64_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %016lX <=> %016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be64(((uint64_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha512_ssl rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c
new file mode 100644
index 000000000..f71d06df8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c
@@ -0,0 +1,203 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha512_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint64_t digest_ref[TEST_BUFS][SHA512_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+ int ret;
+
+ printf("multibinary_sha512 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha512_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha512_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+ // Run sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%016lX <=> 0x%016lX \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ sha512_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sha512_mb test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%016lX <=> 0x%016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ sha512_ctx_mgr_init(mgr);
+
+ // Extend to the end of allocated buffer to construct jobs
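+	// (job i covers the last jobs - i bytes of tmp_buf, so every job ends
+	// exactly at the end of the allocation)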
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ sha512_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%016lX"
+ ", ref: 0x%016lX\n", i, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha512 rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c
new file mode 100644
index 000000000..383c45cd2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c
@@ -0,0 +1,300 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha512_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*SHA512_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA512_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint64_t digest_ref[TEST_BUFS][SHA512_DIGEST_NWORDS];
+
+extern void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+ int ret;
+
+ printf("multibinary_sha512_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha512_ref(bufs[i], digest_ref[i], TEST_LEN);
+ }
+
+ // Run sb_sha512 tests
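+	// Each buffer is fed in UPDATE_SIZE chunks: when submit hands back an
+	// unfinished context, advance that job's pointer and resubmit it.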
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+		// Move to the next job if this one was absorbed (NULL) or came back finished
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha512_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha512_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha512_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+				printf("Test%d fixed size, digest%d fail %8lX <=> %8lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ sha512_ref(bufs[i], digest_ref[i], lens[i]);
+ }
+
+ sha512_ctx_mgr_init(mgr);
+
+ // Run sha512_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = SHA512_BLOCK_SIZE +
+ SHA512_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % SHA512_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem,
+ HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha512_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha512_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % SHA512_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha512_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail %8lX <=> %8lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha512_update rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c
new file mode 100644
index 000000000..a84e7af3e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c
@@ -0,0 +1,270 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha512_mb.h"
+
+typedef uint64_t DigestSHA512[SHA512_DIGEST_NWORDS];
+
+#define MSGS 8
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
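+// For example: PSEUDO_RANDOM_NUM(10) = (10*5 + (10*10)/64) % 8 = 51 % 8 = 3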
+
+static uint8_t msg1[] = "The quick brown fox jumps over the lazy dog";
+static uint8_t msg2[] = "The quick brown fox jumps over the lazy dog.";
+static uint8_t msg3[] = { 0x0a, 0x55, 0xdb, 0 };
+static uint8_t msg4[] = { 0xba, 0xd7, 0xc6, 0x18, 0xf4, 0x5b, 0xe2, 0x07, 0x97, 0x5e, 0 };
+
+static uint8_t msg5[] = {
+ 0xb1, 0x71, 0x5f, 0x78, 0x2f, 0xf0, 0x2c, 0x6b, 0x88, 0x93,
+ 0x7f, 0x05, 0x41, 0x16, 0
+};
+
+static uint8_t msg6[] = {
+ 0xc6, 0xa1, 0x70, 0x93, 0x65, 0x68, 0x65, 0x10, 0x20, 0xed,
+ 0xfe, 0x15, 0xdf, 0x80, 0x12, 0xac, 0xda, 0x8d, 0
+};
+
+static uint8_t msg7[] = {
+ 0xa8, 0xa3, 0x7d, 0xfc, 0x08, 0x3a, 0xd2, 0xf4, 0x7f, 0xff,
+ 0x46, 0x87, 0x38, 0xbf, 0x8b, 0x72, 0x8e, 0xb7, 0xf1, 0x90,
+ 0x7e, 0x42, 0x7f, 0xa1, 0x5c, 0xb4, 0x42, 0x4b, 0xc6, 0x85,
+ 0xe5, 0x5e, 0xd7, 0xb2, 0x82, 0x5c, 0x9c, 0x60, 0xb8, 0x39,
+ 0xcc, 0xc2, 0xfe, 0x5f, 0xb3, 0x3e, 0x36, 0xf5, 0x70, 0xcb,
+ 0x86, 0x61, 0x60, 0x9e, 0x63, 0x0b, 0xda, 0x05, 0xee, 0x64,
+ 0x1d, 0x93, 0x84, 0x28, 0x86, 0x7d, 0x90, 0xe0, 0x07, 0x44,
+ 0xa4, 0xaa, 0xd4, 0x94, 0xc9, 0x3c, 0x5f, 0x6d, 0x13, 0x27,
+ 0x87, 0x80, 0x78, 0x59, 0x0c, 0xdc, 0xe1, 0xe6, 0x47, 0xc9,
+ 0x82, 0x08, 0x18, 0xf4, 0x67, 0x64, 0x1f, 0xcd, 0x50, 0x8e,
+ 0x2f, 0x2e, 0xbf, 0xd0, 0xff, 0x3d, 0x4f, 0x27, 0x23, 0x93,
+ 0x47, 0x8f, 0x3b, 0x9e, 0x6f, 0x80, 0x6b, 0x43, 0
+};
+
+static uint8_t msg8[] = "";
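+// (the binary vectors msg3..msg7 end with an explicit 0 so that strlen() below
+// returns their intended length)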
+
+static DigestSHA512 expResultDigest1 = {
+ 0x07e547d9586f6a73, 0xf73fbac0435ed769, 0x51218fb7d0c8d788, 0xa309d785436bbb64,
+ 0x2e93a252a954f239, 0x12547d1e8a3b5ed6, 0xe1bfd7097821233f, 0xa0538f3db854fee6
+};
+
+static DigestSHA512 expResultDigest2 = {
+ 0x91ea1245f20d46ae, 0x9a037a989f54f1f7, 0x90f0a47607eeb8a1, 0x4d12890cea77a1bb,
+ 0xc6c7ed9cf205e67b, 0x7f2b8fd4c7dfd3a7, 0xa8617e45f3c463d4, 0x81c7e586c39ac1ed
+};
+
+static DigestSHA512 expResultDigest3 = {
+ 0x7952585e5330cb24, 0x7d72bae696fc8a6b, 0x0f7d0804577e347d, 0x99bc1b11e52f3849,
+ 0x85a428449382306a, 0x89261ae143c2f3fb, 0x613804ab20b42dc0, 0x97e5bf4a96ef919b
+};
+
+static DigestSHA512 expResultDigest4 = {
+ 0x5886828959d1f822, 0x54068be0bd14b6a8, 0x8f59f534061fb203, 0x76a0541052dd3635,
+ 0xedf3c6f0ca3d0877, 0x5e13525df9333a21, 0x13c0b2af76515887, 0x529910b6c793c8a5
+};
+
+static DigestSHA512 expResultDigest5 = {
+ 0xee1a56ee78182ec4, 0x1d2c3ab33d4c4187, 0x1d437c5c1ca060ee, 0x9e219cb83689b4e5,
+ 0xa4174dfdab5d1d10, 0x96a31a7c8d3abda7, 0x5c1b5e6da97e1814, 0x901c505b0bc07f25
+};
+
+static DigestSHA512 expResultDigest6 = {
+ 0xc36c100cdb6c8c45, 0xb072f18256d63a66, 0xc9843acb4d07de62, 0xe0600711d4fbe64c,
+ 0x8cf314ec3457c903, 0x08147cb7ac7e4d07, 0x3ba10f0ced78ea72, 0x4a474b32dae71231
+};
+
+static DigestSHA512 expResultDigest7 = {
+ 0x8e1c91729be8eb40, 0x226f6c58a029380e, 0xf7edb9dc166a5c3c, 0xdbcefe90bd30d85c,
+ 0xb7c4b248e66abf0a, 0x3a4c842281299bef, 0x6db88858d9e5ab52, 0x44f70b7969e1c072
+};
+
+static DigestSHA512 expResultDigest8 = {
+	0xcf83e1357eefb8bd, 0xf1542850d66d8007, 0xd620e4050b5715dc, 0x83f4a921d36ce9ce,
+	0x47d0d13c5d85f2b0, 0xff8318d2877eec2f, 0x63b931bd47417a81, 0xa538327af927da3e
+};
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7, msg8 };
+
+static uint64_t *expResultDigest[MSGS] = { expResultDigest1, expResultDigest2,
+ expResultDigest3, expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7, expResultDigest8
+};
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint64_t *good;
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha512_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+ }
+ }
+
+ while (1) {
+ ctx = sha512_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the "
+ "submit. Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = sha512_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sha512 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..87d7837f6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c
@@ -0,0 +1,129 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha512_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 1000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 10
+# define TEST_TYPE_STR "_cold"
+#endif
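+// ("warm" loops over a small cached buffer; "cold" walks a region larger than
+// the last level cache so each pass misses cache)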
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][8 * SHA512_DIGEST_NWORDS];
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sha512_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA512(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha512_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha512" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be64(((uint64_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %016lX <=> %016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_be64(((uint64_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer sha512 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf("multibinary_sha512_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm
new file mode 100644
index 000000000..5d443faf7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm
@@ -0,0 +1,442 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute SHA512 by-2 using AVX
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11
+;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
+;;
+;; Linux clobbers: rax rsi r8 r9 r10 r11
+;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
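+;;
+;; Two message lanes are hashed in lock step: each xmm register holds the same
+;; 64-bit working variable for both lanes, one qword per lane.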
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*4
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux definitions
+%define arg1 rdi
+%define arg2 rsi
+%else
+; Windows definitions
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND r8
+%define TBL r11
+
+%define inp0 r9
+%define inp1 r10
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ2
+
+; Define stack usage
+
+struc STACK
+_DATA: resb SZ2 * 16
+_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS
+ resb 8 ; for alignment, must be odd multiple of 8
+endstruc
+
+%define VMOVPD vmovupd
+
+; transpose r0, r1, t0
+; Input looks like {r0 r1}
+; r0 = {a1 a0}
+; r1 = {b1 b0}
+;
+; output looks like
+; r0 = {b0, a0}
+; t0 = {b1, a1}
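+; (vshufpd immediate: bit 0 selects the low/high qword of the first source,
+;  bit 1 of the second)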
+
+%macro TRANSPOSE 3
+%define %%r0 %1
+%define %%r1 %2
+%define %%t0 %3
+ vshufpd %%t0, %%r0, %%r1, 11b ; t0 = b1 a1
+ vshufpd %%r0, %%r0, %%r1, 00b ; r0 = b0 a0
+%endm
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; packed-rotate-right-double
+; does a rotate by doing two shifts and an or
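+; i.e. reg = (reg >> imm) | (reg << (64 - imm))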
+%macro PRORQ 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsllq %%tmp, %%reg, (64-(%%imm))
+ vpsrlq %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORQ_nd reg, imm, tmp, src
+%macro PRORQ_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsllq %%tmp, %%src, (64-(%%imm))
+ vpsrlq %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORQ dst/src, amt
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
+
+; PRORQ_nd dst, src, amt
+%macro PRORQ_nd 3
+ PRORQ_nd %1, %3, TMP, %2
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41)
+ vmovdqa [SZ2*(%%i&0xf) + rsp + _DATA],%%T1
+ vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ vpaddq h, h, a2 ; h = h + ch
+ PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6)
+ vpaddq h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ vmovdqa %%T1, a ; maj: T1 = a
+ PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39)
+ vpxor %%T1, %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ2 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddq h, h, a0
+
+ vpaddq d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddq h, h, a1 ; h = h + ch + W + K + maj
+ vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0
+ ROTATE_ARGS
+%endm
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp + _DATA]
+ vmovdqa a1, [SZ2*((%%i-2)&0xf) + rsp + _DATA]
+ vmovdqa a0, %%T1
+ PRORQ %%T1, 8-1
+ vmovdqa a2, a1
+ PRORQ a1, 61-19
+ vpxor %%T1, %%T1, a0
+ PRORQ %%T1, 1
+ vpxor a1, a1, a2
+ PRORQ a1, 19
+ vpsrlq a0, a0, 7
+ vpxor %%T1, %%T1, a0
+ vpsrlq a2, a2, 6
+ vpxor a1, a1, a2
+ vpaddq %%T1, %%T1, [SZ2*((%%i-16)&0xf) + rsp + _DATA]
+ vpaddq a1, a1, [SZ2*((%%i-7)&0xf) + rsp + _DATA]
+ vpaddq %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+;; void sha512_mb_x2_avx(SHA512_MB_ARGS_X4 *args, uint64_t msg_size_in_blocks)
+;; arg 1 : STATE : pointer args (only 2 of the 4 lanes used)
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;;
+mk_global sha512_mb_x2_avx, function, internal
+align 32
+sha512_mb_x2_avx:
+ endbranch
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+
+ sub rsp, STACK_size
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE]
+
+ lea TBL,[K512_2_MB]
+
+ ;; load the address of each of the 2 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ2], a
+ vmovdqa [rsp + _DIGEST + 1*SZ2], b
+ vmovdqa [rsp + _DIGEST + 2*SZ2], c
+ vmovdqa [rsp + _DIGEST + 3*SZ2], d
+ vmovdqa [rsp + _DIGEST + 4*SZ2], e
+ vmovdqa [rsp + _DIGEST + 5*SZ2], f
+ vmovdqa [rsp + _DIGEST + 6*SZ2], g
+ vmovdqa [rsp + _DIGEST + 7*SZ2], h
+
+%assign i 0
+%rep 8
+ ;; load up the shuffler for little-endian to big-endian format
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits
+ VMOVPD TT2,[inp1+IDX+i*16]
+
+ TRANSPOSE TT0, TT2, TT1
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+
+ ROUND_00_15 TT0,(i*2+0)
+ ROUND_00_15 TT1,(i*2+1)
+%assign i (i+1)
+%endrep
+
+;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes)
+ add IDX, 8 * 16
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddq a, a, [rsp + _DIGEST + 0*SZ2]
+ vpaddq b, b, [rsp + _DIGEST + 1*SZ2]
+ vpaddq c, c, [rsp + _DIGEST + 2*SZ2]
+ vpaddq d, d, [rsp + _DIGEST + 3*SZ2]
+ vpaddq e, e, [rsp + _DIGEST + 4*SZ2]
+ vpaddq f, f, [rsp + _DIGEST + 5*SZ2]
+ vpaddq g, g, [rsp + _DIGEST + 6*SZ2]
+ vpaddq h, h, [rsp + _DIGEST + 7*SZ2]
+
+ sub INP_SIZE, 1 ;; consumed one message block
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqa [STATE+0*SHA512_DIGEST_ROW_SIZE],a
+ vmovdqa [STATE+1*SHA512_DIGEST_ROW_SIZE],b
+ vmovdqa [STATE+2*SHA512_DIGEST_ROW_SIZE],c
+ vmovdqa [STATE+3*SHA512_DIGEST_ROW_SIZE],d
+ vmovdqa [STATE+4*SHA512_DIGEST_ROW_SIZE],e
+ vmovdqa [STATE+5*SHA512_DIGEST_ROW_SIZE],f
+ vmovdqa [STATE+6*SHA512_DIGEST_ROW_SIZE],g
+ vmovdqa [STATE+7*SHA512_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, STACK_size
+
+ ; outer calling routine restores XMM and other GP registers
+ ret
+
+section .data
+K512_2_MB:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817
+
+
+align 32
+; borrowed from sha512_rorx
+; byte-swaps each quad word (converts the little-endian
+; loaded message words to big-endian)
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm
new file mode 100644
index 000000000..6c658023f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm
@@ -0,0 +1,424 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute SHA512 by-2 using SSE
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; xmm0-15
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11
+;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
+;;
+;; Linux clobbers: rax rsi r8 r9 r10 r11
+;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*4
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+%else
+; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND r8
+%define TBL r11
+
+%define inp0 r9
+%define inp1 r10
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ2
+
+; Define stack usage
+
+struc STACK
+_DATA: resb SZ2 * 16
+_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS
+ resb 8 ; for alignment, must be odd multiple of 8
+endstruc
+
+%define MOVPD movupd
+
+; transpose r0, r1, t0
+; Input looks like {r0 r1}
+; r0 = {a1 a0}
+; r1 = {b1 b0}
+;
+; output looks like
+; r0 = {b0, a0}
+; t0 = {b1, a1}
+
+%macro TRANSPOSE 3
+%define %%r0 %1
+%define %%r1 %2
+%define %%t0 %3
+ movapd %%t0, %%r0 ; t0 = a1 a0
+ shufpd %%r0, %%r1, 00b ; r0 = b0 a0
+ shufpd %%t0, %%r1, 11b ; t0 = b1 a1
+%endm
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; packed-rotate-right-double
+; does a rotate by doing two shifts and an or
+%macro PRORQ 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psllq %%tmp, (64-(%%imm))
+ psrlq %%reg, %%imm
+ por %%reg, %%tmp
+%endmacro
+
+; PRORQ dst/src, amt
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ movdqa a0, e ; sig1: a0 = e
+ movdqa a1, e ; sig1: s1 = e
+ PRORQ a0, (18-14) ; sig1: a0 = (e >> 4)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORQ a1, 41 ; sig1: a1 = (e >> 41)
+ movdqa [SZ2*(%%i&0xf) + rsp],%%T1
+ paddq %%T1,[TBL + ROUND] ; T1 = W + K
+ pxor a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ paddq h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORQ a2, (34-28) ; sig0: a2 = (a >> 6)
+ paddq h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORQ a1, 39 ; sig0: a1 = (a >> 39)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ2 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddq h, a0
+
+ paddq d, h
+
+ pxor a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddq h, a1 ; h = h + ch + W + K + maj
+ paddq h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ movdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp]
+ movdqa a1, [SZ2*((%%i-2)&0xf) + rsp]
+ movdqa a0, %%T1
+ PRORQ %%T1, 8-1
+ movdqa a2, a1
+ PRORQ a1, 61-19
+ pxor %%T1, a0
+ PRORQ %%T1, 1
+ pxor a1, a2
+ PRORQ a1, 19
+ psrlq a0, 7
+ pxor %%T1, a0
+ psrlq a2, 6
+ pxor a1, a2
+ paddq %%T1, [SZ2*((%%i-16)&0xf) + rsp]
+ paddq a1, [SZ2*((%%i-7)&0xf) + rsp]
+ paddq %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+;; void sha512_mb_x2_sse(SHA512_MB_ARGS_X4 *args, uint64_t num_blocks);
+;; arg 1 : STATE : pointer args (only 2 of the 4 lanes used)
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;;
+mk_global sha512_mb_x2_sse, function, internal
+align 32
+sha512_mb_x2_sse:
+ endbranch
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+ sub rsp, STACK_size
+
+ ;; Load the pre-transposed incoming digest.
+ movdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE]
+ movdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE]
+ movdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE]
+ movdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE]
+ movdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE]
+ movdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE]
+ movdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE]
+ movdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE]
+
+ lea TBL,[K512_2_MB]
+
+ ;; load the address of each of the 2 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+ ;; save old digest
+ movdqa [rsp + _DIGEST + 0*SZ2], a
+ movdqa [rsp + _DIGEST + 1*SZ2], b
+ movdqa [rsp + _DIGEST + 2*SZ2], c
+ movdqa [rsp + _DIGEST + 3*SZ2], d
+ movdqa [rsp + _DIGEST + 4*SZ2], e
+ movdqa [rsp + _DIGEST + 5*SZ2], f
+ movdqa [rsp + _DIGEST + 6*SZ2], g
+ movdqa [rsp + _DIGEST + 7*SZ2], h
+
+%assign i 0
+%rep 8
+ ;; load up the shuffler for little-endian to big-endian format
+ movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ MOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits
+ MOVPD TT2,[inp1+IDX+i*16]
+ TRANSPOSE TT0, TT2, TT1
+ pshufb TT0, TMP
+ pshufb TT1, TMP
+ ROUND_00_15 TT0,(i*2+0)
+ ROUND_00_15 TT1,(i*2+1)
+%assign i (i+1)
+%endrep
+ add IDX, 8 * 16 ;; increment by a message block
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ paddq a, [rsp + _DIGEST + 0*SZ2]
+ paddq b, [rsp + _DIGEST + 1*SZ2]
+ paddq c, [rsp + _DIGEST + 2*SZ2]
+ paddq d, [rsp + _DIGEST + 3*SZ2]
+ paddq e, [rsp + _DIGEST + 4*SZ2]
+ paddq f, [rsp + _DIGEST + 5*SZ2]
+ paddq g, [rsp + _DIGEST + 6*SZ2]
+ paddq h, [rsp + _DIGEST + 7*SZ2]
+
+ sub INP_SIZE, 1 ;; unit is blocks
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ movdqa [STATE + 0*SHA512_DIGEST_ROW_SIZE],a
+ movdqa [STATE + 1*SHA512_DIGEST_ROW_SIZE],b
+ movdqa [STATE + 2*SHA512_DIGEST_ROW_SIZE],c
+ movdqa [STATE + 3*SHA512_DIGEST_ROW_SIZE],d
+ movdqa [STATE + 4*SHA512_DIGEST_ROW_SIZE],e
+ movdqa [STATE + 5*SHA512_DIGEST_ROW_SIZE],f
+ movdqa [STATE + 6*SHA512_DIGEST_ROW_SIZE],g
+ movdqa [STATE + 7*SHA512_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, STACK_size
+ ret
+
+section .data
+align 64
+mk_global K512_2_MB, data, internal
+K512_2_MB:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm
new file mode 100644
index 000000000..0058f33a6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm
@@ -0,0 +1,487 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute quad SHA512 using AVX2
+;; use YMMs to tackle the larger digest size
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12
+;; Windows preserves: rcx rsi rdi rbp r13 r14 r15
+;;
+;; Linux clobbers:    rax rbx rsi r8 r9 r10 r11 r12
+;; Linux preserves:         rcx rdx rdi rbp r13 r14 r15
+;;
+;; clobbers ymm0-15
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*4
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+%else
+; Windows register definitions
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND rbx
+%define TBL r8
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+
+%define a ymm0
+%define b ymm1
+%define c ymm2
+%define d ymm3
+%define e ymm4
+%define f ymm5
+%define g ymm6
+%define h ymm7
+
+%define a0 ymm8
+%define a1 ymm9
+%define a2 ymm10
+
+%define TT0 ymm14
+%define TT1 ymm13
+%define TT2 ymm12
+%define TT3 ymm11
+%define TT4 ymm10
+%define TT5 ymm9
+
+%define T1 ymm14
+%define TMP ymm15
+
+%define SZ4 4*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ4
+
+; Define stack usage
+
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESZ mod 32 must be 32-8 = 24
+struc stack_frame
+ .data resb 16*SZ4
+ .digest resb NUM_SHA512_DIGEST_WORDS*SZ4
+ .align resb 24
+endstruc
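+;; .data   : 16 * SZ4 = 512 bytes, ring buffer holding the message schedule
+;; .digest : 8 * SZ4  = 256 bytes, copy of the digest saved before each block
+;; .align  : 24 bytes of padding so the total (792) mod 32 == 24, keeping rsp
+;;           32-byte aligned after the 8-byte return address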
+
+%define _DIGEST stack_frame.digest
+
+%define VMOVPD vmovupd
+
+; operates on YMMs
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d1 d0 c1 c0 b1 b0 a1 a0}
+; r1 = {d3 d2 c3 c2 b3 b2 a3 a2}
+; r0 = {d5 d4 c5 c4 b5 b4 a5 a4}
+; r3 = {d7 d6 c7 c6 b7 b6 a7 a6}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ ; vshufps does not cross the mid-way boundary and hence is cheaper
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vperm2f128 %%r1, %%r0, %%r2, 0x20; r1 = {d3 d2 c3 c2 b3 b2 a3 a2}
+
+ vperm2f128 %%r3, %%r0, %%r2, 0x31; r3 = {d7 d6 c7 c6 b7 b6 a7 a6}
+
+ vperm2f128 %%r0, %%t0, %%t1, 0x31; r0 = {d5 d4 c5 c4 b5 b4 a5 a4}
+
+ ; now ok to clobber t0
+ vperm2f128 %%t0, %%t0, %%t1, 0x20; t0 = {d1 d0 c1 c0 b1 b0 a1 a0}
+
+%endmacro
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; packed-rotate-right-double
+; does a rotate by doing two shifts and an or
+%macro PRORQ 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsllq %%tmp, %%reg, (64-(%%imm))
+ vpsrlq %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORQ_nd reg, imm, tmp, src
+%macro PRORQ_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsllq %%tmp, %%src, (64-(%%imm))
+ vpsrlq %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORQ dst/src, amt
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
+
+; PRORQ_nd dst, src, amt
+%macro PRORQ_nd 3
+ PRORQ_nd %1, %3, TMP, %2
+%endmacro
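+
+; AVX2 has no packed 64-bit rotate instruction (vprorq only arrives with
+; AVX-512), so PRORQ synthesizes the rotate per lane from two shifts and an
+; or; e.g. "PRORQ a0, 14" leaves a0 = ror64(a0, 14) in every 64-bit lane,
+; using TMP as scratch.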
+
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41)
+ vmovdqa [SZ4*(%%i&0xf) + rsp],%%T1
+ vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K
+	vpxor	a0, a0, e	; sig1: a0 = e ^ (e >> 4)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ vpaddq h, h, a2 ; h = h + ch
+ PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6)
+ vpaddq h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ vmovdqa %%T1, a ; maj: T1 = a
+ PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39)
+ vpxor %%T1, %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddq h, h, a0
+
+ vpaddq d, d, h
+
+	vpxor	a2, a2, a	; sig0: a2 = a ^ (a >> 6)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddq h, h, a1 ; h = h + ch + W + K + maj
+ vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0
+ ROTATE_ARGS
+
+%endm
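+
+;; ROUND_00_15 computes one SHA-512 round across the four lanes:
+;;   T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
+;;   T2 = Sigma0(a) + Maj(a,b,c)
+;;   d = d + T1 ; h = T1 + T2
+;; ROTATE_ARGS then renames the registers rather than copying them.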
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp]
+ vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp]
+ vmovdqa a0, %%T1
+ PRORQ %%T1, 8-1
+ vmovdqa a2, a1
+ PRORQ a1, 61-19
+ vpxor %%T1, %%T1, a0
+ PRORQ %%T1, 1
+ vpxor a1, a1, a2
+ PRORQ a1, 19
+ vpsrlq a0, a0, 7
+ vpxor %%T1, %%T1, a0
+ vpsrlq a2, a2, 6
+ vpxor a1, a1, a2
+ vpaddq %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp]
+ vpaddq a1, a1, [SZ4*((%%i-7)&0xf) + rsp]
+ vpaddq %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+
+%endm
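+
+;; ROUND_16_XX computes the message schedule for rounds 16..79:
+;;   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
+;; using the 16-entry ring buffer kept on the stack, then feeds the new word
+;; straight into ROUND_00_15.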
+
+
+;; void sha512_mb_x4_avx2(SHA512_MB_ARGS_X4 *STATE, const int INP_SIZE)
+;; arg 1 : STATE : pointer to args structure (digests and data pointers)
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+mk_global sha512_mb_x4_avx2, function, internal
+align 32
+sha512_mb_x4_avx2:
+ endbranch
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+
+ sub rsp, stack_frame_size
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqu a, [STATE+ 0*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu b, [STATE+ 1*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu c, [STATE+ 2*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu d, [STATE+ 3*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu e, [STATE+ 4*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu f, [STATE+ 5*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu g, [STATE+ 6*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu h, [STATE+ 7*SHA512_DIGEST_ROW_SIZE]
+
+
+ lea TBL,[K512_4_MB]
+
+ ;; load the address of each of the MAX_LANES (4) message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 + 0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 + 1*PTR_SZ]
+ mov inp2,[STATE + _data_ptr_sha512 + 2*PTR_SZ]
+ mov inp3,[STATE + _data_ptr_sha512 + 3*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ4], a
+ vmovdqa [rsp + _DIGEST + 1*SZ4], b
+ vmovdqa [rsp + _DIGEST + 2*SZ4], c
+ vmovdqa [rsp + _DIGEST + 3*SZ4], d
+ vmovdqa [rsp + _DIGEST + 4*SZ4], e
+ vmovdqa [rsp + _DIGEST + 5*SZ4], f
+ vmovdqa [rsp + _DIGEST + 6*SZ4], g
+ vmovdqa [rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+ ;; load up the shuffler for little-endian to big-endian format
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPD TT2,[inp0+IDX+i*32]
+ VMOVPD TT1,[inp1+IDX+i*32]
+ VMOVPD TT4,[inp2+IDX+i*32]
+ VMOVPD TT3,[inp3+IDX+i*32]
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+ vpshufb TT2, TT2, TMP
+ vpshufb TT3, TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+;; Increment IDX by message block size == 4 (loop) * 32 (YMM width in bytes)
+ add IDX, 4 * 32
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddq a, a, [rsp + _DIGEST + 0*SZ4]
+ vpaddq b, b, [rsp + _DIGEST + 1*SZ4]
+ vpaddq c, c, [rsp + _DIGEST + 2*SZ4]
+ vpaddq d, d, [rsp + _DIGEST + 3*SZ4]
+ vpaddq e, e, [rsp + _DIGEST + 4*SZ4]
+ vpaddq f, f, [rsp + _DIGEST + 5*SZ4]
+ vpaddq g, g, [rsp + _DIGEST + 6*SZ4]
+ vpaddq h, h, [rsp + _DIGEST + 7*SZ4]
+
+ sub INP_SIZE, 1 ;; consumed one message block
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqu [STATE+ 0*SHA512_DIGEST_ROW_SIZE ],a
+ vmovdqu [STATE+ 1*SHA512_DIGEST_ROW_SIZE ],b
+ vmovdqu [STATE+ 2*SHA512_DIGEST_ROW_SIZE ],c
+ vmovdqu [STATE+ 3*SHA512_DIGEST_ROW_SIZE ],d
+ vmovdqu [STATE+ 4*SHA512_DIGEST_ROW_SIZE ],e
+ vmovdqu [STATE+ 5*SHA512_DIGEST_ROW_SIZE ],f
+ vmovdqu [STATE+ 6*SHA512_DIGEST_ROW_SIZE ],g
+ vmovdqu [STATE+ 7*SHA512_DIGEST_ROW_SIZE ],h
+
+ ;; update input data pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+ add inp2, IDX
+ mov [STATE + _data_ptr_sha512 + 2*PTR_SZ], inp2
+ add inp3, IDX
+ mov [STATE + _data_ptr_sha512 + 3*PTR_SZ], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, stack_frame_size
+
+ ; outer calling routine restores XMM and other GP registers
+ ret
+
+section .data
+align 64
+K512_4_MB:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817
+
+align 32
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm
new file mode 100644
index 000000000..a93fecb1b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm
@@ -0,0 +1,644 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute 8-lane SHA512 using AVX512
+;; use ZMMs to tackle the larger digest size
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers: rax rbx rdx rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rsi
+;;
+;; Linux clobbers: rax rbx rcx rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdx rdi
+;;
+;; clobbers zmm0-31
+
+%define APPEND(a,b) a %+ b
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0 preserved
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2 preserved
+ %define reg4 r9 ; arg3
+ %define var1 rdi ; usable
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var2 rdx ; arg2
+ %define var1 rcx ; arg3 usable
+ %define local_func_decl(func_name) mk_global func_name, function, internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+
+%define IDX var1
+%define TBL r8
+
+%define VMOVDQ32 vmovdqu32
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*8
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+%define NUM_LANES 8
+%define SZ 8
+%define SZ8 8 * SZ
+%define DIGEST_SZ 8 * SZ8
+%define DIGEST_SAVE NUM_LANES * DIGEST_SZ
+%define RSP_SAVE 1*8
+
+; Define Stack Layout
+START_FIELDS
+;;; name size align
+FIELD _DIGEST_SAVE, NUM_LANES*8*64, 64
+FIELD _RSP, 8, 8
+%assign STACK_SPACE _FIELD_OFFSET
+
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+%define T1 zmm8
+%define TMP0 zmm9
+%define TMP1 zmm10
+%define TMP2 zmm11
+%define TMP3 zmm12
+%define TMP4 zmm13
+%define TMP5 zmm14
+%define TMP6 zmm15
+
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+; rotate amounts from FIPS 180-2 (SHA-512)
+; define rotates for Sigma function for main loop steps
+%define BIG_SIGMA_0_0 28 ; Sigma0
+%define BIG_SIGMA_0_1 34
+%define BIG_SIGMA_0_2 39
+%define BIG_SIGMA_1_0 14 ; Sigma1
+%define BIG_SIGMA_1_1 18
+%define BIG_SIGMA_1_2 41
+
+; define rotates for Sigma function for scheduling steps
+
+%define SMALL_SIGMA_0_0 1 ; sigma0
+%define SMALL_SIGMA_0_1 8
+%define SMALL_SIGMA_0_2 7
+%define SMALL_SIGMA_1_0 19 ; sigma1
+%define SMALL_SIGMA_1_1 61
+%define SMALL_SIGMA_1_2 6
+
+%define SHA_MAX_ROUNDS 80
+%define SHA_ROUNDS_LESS_16 (SHA_MAX_ROUNDS - 16)
+
+%macro TRANSPOSE8 12
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+%define %%PERM_INDEX1 %11
+%define %%PERM_INDEX2 %12
+
+
+; each element is 64 bits, 8 * 64 = 512 ==> a full ZMM register of quad-word quantities
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+
+	;; the permute index registers will not get clobbered
+ vmovdqa32 %%PERM_INDEX1, [TRANSPOSE8_PERM_INDEX_1] ; temp
+ vmovdqa32 %%PERM_INDEX2, [TRANSPOSE8_PERM_INDEX_2] ; temp
+
+ ; process top half (r0..r3) {a...d}
+ vshufpd %%t0, %%r0, %%r1, 0x00 ; t0 = {b6 a6 b4 a4 b2 a2 b0 a0}
+ vshufpd %%r0, %%r0, %%r1, 0xFF ; r0 = {b7 a7 b5 a5 b3 a3 b1 a1}
+ vshufpd %%t1, %%r2, %%r3, 0x00 ; t1 = {d6 c6 d4 c4 d2 c2 d0 c0}
+ vshufpd %%r2, %%r2, %%r3, 0xFF ; r2 = {d7 c7 d5 c5 d3 c3 d1 c1}
+
+ vmovdqa32 %%r1, %%t0 ; r1 and r3 free
+ vpermt2q %%r1, %%PERM_INDEX1,%%t1 ; r1 = {d4 c4 b4 a4 d0 c0 b0 a0}
+ vpermt2q %%t0, %%PERM_INDEX2,%%t1 ; t0 = {d6 c6 b6 a6 d2 c2 b2 a2}
+
+ vmovdqa32 %%t1, %%r0 ; t1 and r3 free
+ vpermt2q %%t1, %%PERM_INDEX1,%%r2 ; t1 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vpermt2q %%r0, %%PERM_INDEX2,%%r2 ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+
+	;; Likewise for bottom half (r4..r7) {e...h}	; r2 and r3 free
+ vshufpd %%r2, %%r4, %%r5, 0x00 ; r2 = {f6 e6 f4 e4 f2 e2 f0 e0}
+ vshufpd %%r4, %%r4, %%r5, 0xFF ; r4 = {f7 e7 f5 e5 f3 e3 f1 e1}
+ vshufpd %%r3, %%r6, %%r7, 0x00 ; r3 = {h6 g6 h4 g4 h2 g2 h0 g0}
+ vshufpd %%r6, %%r6, %%r7, 0xFF ; r6 = {h7 g7 h5 g5 h3 g3 h1 g1}
+
+ vmovdqa32 %%r5, %%r2 ; r5 and r7 free
+ vpermt2q %%r5, %%PERM_INDEX1,%%r3 ; r5 = {h4 g4 f4 e4 h0 g0 f0 e0}
+ vpermt2q %%r2, %%PERM_INDEX2,%%r3 ; r2 = {h6 g6 f6 e6 h2 g2 f2 e2}
+
+ vmovdqa32 %%r7, %%r4
+ vpermt2q %%r7, %%PERM_INDEX1,%%r6 ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vpermt2q %%r4, %%PERM_INDEX2,%%r6 ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+
+;;; free r3, r6
+ vshuff64x2 %%r6, %%t0, %%r2, 0xEE ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+ vshuff64x2 %%r2, %%t0, %%r2, 0x44 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+
+;;; t0 and r3 free
+ vshuff64x2 %%r3, %%r0, %%r4, 0x44 ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+ vshuff64x2 %%t0, %%r0, %%r4, 0xEE ; t0 = {h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r4, %%r1, %%r5, 0xEE ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+ vshuff64x2 %%r0, %%r1, %%r5, 0x44 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+
+
+ vshuff64x2 %%r5, %%t1, %%r7, 0xEE ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+ vshuff64x2 %%r1, %%t1, %%r7, 0x44 ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+
+ ;; will re-order input to avoid move
+ ;vmovdqa32 %%r7, %%t0
+
+ ; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+ ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+ ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+ ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+ ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+ ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+ ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+ ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+	; t0 = {h7 g7 f7 e7 d7 c7 b7 a7}	(row 7 is left in the temp register t0)
+%endmacro
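+
+;; After TRANSPOSE8 each output register holds the same word index from all
+;; eight lanes (row 7 ends up in the t0 temp), which is the layout the
+;; vectorized round computation expects.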
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ H
+%xdefine H G
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+
+
+;; CH(E, F, G)  = (E&F) ^ (~E&G)
+;; MAJ(A, B, C) = (A&B) ^ (A&C) ^ (B&C)
+;; SIGMA0 = ROR_28 ^ ROR_34 ^ ROR_39
+;; SIGMA1 = ROR_14 ^ ROR_18 ^ ROR_41
+;; sigma0 = ROR_1 ^ ROR_8 ^ SHR_7
+;; sigma1 = ROR_19 ^ ROR_61 ^ SHR_6
+
+;; Main processing loop per round
+;; equivalent to %macro ROUND_00_15 2
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%ROUND %2
+ ;; T1 = H + BIG_SIGMA_1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = BIG_SIGMA_0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ vpaddq T1, H, TMP3 ; T1 = H + Kt
+ vmovdqa32 TMP0, E
+ ;; compute BIG_SIGMA_1(E)
+ vprorq TMP1, E, BIG_SIGMA_1_0 ; ROR_14(E)
+ vprorq TMP2, E, BIG_SIGMA_1_1 ; ROR_18(E)
+ vprorq TMP3, E, BIG_SIGMA_1_2 ; ROR_41(E)
+ vpternlogq TMP1, TMP2, TMP3, 0x96 ; TMP1 = BIG_SIGMA_1(E)
+ vpternlogq TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddq T1, T1, %%WT ; T1 = T1 + Wt
+ vpaddq T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vpaddq T1, T1, TMP1 ; T1 = T1 + BIG_SIGMA_1(E)
+ vpaddq D, D, T1 ; D = D + T1
+ vprorq H, A, BIG_SIGMA_0_0 ;ROR_28(A)
+ vprorq TMP2, A, BIG_SIGMA_0_1 ;ROR_34(A)
+ vprorq TMP3, A, BIG_SIGMA_0_2 ;ROR_39(A)
+ vmovdqa32 TMP0, A
+ vpternlogq TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpternlogq H, TMP2, TMP3, 0x96 ; H(T2) = BIG_SIGMA_0(A)
+ vpaddq H, H, TMP0 ; H(T2) = BIG_SIGMA_0(A) + MAJ(A,B,C)
+ vpaddq H, H, T1 ; H(A) = H(T2) + T1
+ vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
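+
+;; vpternlogq immediates used above: 0x96 is a three-way XOR (both Sigma
+;; functions), 0xCA computes (dst & src2) | (~dst & src3), which is CH(E,F,G)
+;; when the destination holds E, and 0xE8 is the bitwise majority of the
+;; three operands.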
+
+%macro MSG_SCHED_ROUND_16_79 4
+%define %%WT %1
+%define %%WTp1 %2
+%define %%WTp9 %3
+%define %%WTp14 %4
+ vprorq TMP4, %%WTp14, SMALL_SIGMA_1_0 ; ROR_19(Wt-2)
+ vprorq TMP5, %%WTp14, SMALL_SIGMA_1_1 ; ROR_61(Wt-2)
+ vpsrlq TMP6, %%WTp14, SMALL_SIGMA_1_2 ; SHR_6(Wt-2)
+ vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_1(Wt-2)
+
+ vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2)
+ vpaddq %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma_1(Wt-2) + Wt-7
+
+ vprorq TMP4, %%WTp1, SMALL_SIGMA_0_0 ; ROR_1(Wt-15)
+ vprorq TMP5, %%WTp1, SMALL_SIGMA_0_1 ; ROR_8(Wt-15)
+ vpsrlq TMP6, %%WTp1, SMALL_SIGMA_0_2 ; SHR_7(Wt-15)
+ vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_0(Wt-15)
+
+ vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2) +
+ ; Wt-7 + sigma_0(Wt-15) +
+
+%endmacro
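+
+;; MSG_SCHED_ROUND_16_79 schedules one future message word per call:
+;;   W[t] = W[t-16] + sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15])
+;; The sixteen W registers form a ring, so Wt is overwritten in place and
+;; reused sixteen rounds later.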
+
+align 64
+
+; void sha512_mb_x8_avx512(SHA512_MB_ARGS_X8 *args, uint32_t size)
+; arg 1 : pointer to args structure (digests and data pointers)
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+local_func_decl(sha512_mb_x8_avx512)
+sha512_mb_x8_avx512:
+ endbranch
+ mov rax, rsp
+ sub rsp, STACK_SPACE
+ and rsp, ~63 ; align stack to multiple of 64
+ mov [rsp + _RSP], rax
+ lea TBL,[TABLE]
+
+ ;; Initialize digests
+ vmovups A, [DIGEST + 0*8*8]
+ vmovups B, [DIGEST + 1*8*8]
+ vmovups C, [DIGEST + 2*8*8]
+ vmovups D, [DIGEST + 3*8*8]
+ vmovups E, [DIGEST + 4*8*8]
+ vmovups F, [DIGEST + 5*8*8]
+ vmovups G, [DIGEST + 6*8*8]
+ vmovups H, [DIGEST + 7*8*8]
+
+ xor IDX, IDX
+	;; Read in the input data addresses, saving them in registers because
+	;; they serve as variables that we keep incrementing
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+
+lloop:
+
+ ;; first half of 1024 (need to transpose before use)
+ vmovups W0,[inp0 + IDX ]
+ vmovups W1,[inp1 + IDX ]
+ vmovups W2,[inp2 + IDX ]
+ vmovups W3,[inp3 + IDX ]
+ vmovups W4,[inp4 + IDX ]
+ vmovups W5,[inp5 + IDX ]
+ vmovups W6,[inp6 + IDX ]
+ vmovups TMP0,[inp7 + IDX ]
+ TRANSPOSE8 W0, W1, W2, W3, W4, W5, W6, TMP0, W7, TMP1, TMP2, TMP3
+ ;; second half of 1024 (need to transpose before use)
+ vmovups W8,[inp0 + SZ8 + IDX ]
+ vmovups W9,[inp1 + SZ8 + IDX ]
+ vmovups W10,[inp2 + SZ8 + IDX ]
+ vmovups W11,[inp3 + SZ8 + IDX ]
+ vmovups W12,[inp4 + SZ8 + IDX ]
+ vmovups W13,[inp5 + SZ8 + IDX ]
+ vmovups W14,[inp6 + SZ8 + IDX ]
+ vmovups TMP0,[inp7 + SZ8 + IDX ]
+ TRANSPOSE8 W8, W9, W10, W11, W12, W13, W14, TMP0, W15, TMP1, TMP2, TMP3
+
+ vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ vmovdqa32 TMP3, [TBL] ; First K
+
+ ; Save digests for later addition
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ add IDX, 128 ; increment by message block length in bytes
+
+
+
+
+%assign I 0
+%rep 16
+;;; little endian to big endian
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+ ; Save digests for later addition
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ ; MSG Schedule for W0-W15 is now complete in registers
+ ; Process first (max-rounds -16)
+ ; Calculate next Wt+16 after processing is complete and Wt is unneeded
+
+ ; PROCESS_LOOP_00_79 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep SHA_ROUNDS_LESS_16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+	; Check if this is the last block
+ sub SIZE, 1
+ je lastLoop
+
+	; Process the last 16 rounds of this block; the next block's message
+	; data is loaded back at the top of lloop
+%assign I SHA_ROUNDS_LESS_16
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+ ; Add old digest
+ vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ jmp lloop
+
+
+lastLoop:
+ ; Process last 16 rounds
+%assign I SHA_ROUNDS_LESS_16
+%assign J 0
+
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+;; update input data pointers
+%assign I 0
+%rep 4
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ VMOVDQ32 [DIGEST + 0*8*8], A
+ VMOVDQ32 [DIGEST + 1*8*8], B
+ VMOVDQ32 [DIGEST + 2*8*8], C
+ VMOVDQ32 [DIGEST + 3*8*8], D
+ VMOVDQ32 [DIGEST + 4*8*8], E
+ VMOVDQ32 [DIGEST + 5*8*8], F
+ VMOVDQ32 [DIGEST + 6*8*8], G
+ VMOVDQ32 [DIGEST + 7*8*8], H
+
+ mov rsp, [rsp + _RSP]
+ ret
+
+ section .data
+align 64
+; 80 SHA-512 round constants (K), each replicated once per lane, thus 8*80
+; quad words, trading space for time so a full ZMM of Kt can be loaded at once
+; local to this asm file, used nowhere else
+TABLE:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817
+
+align 64
+; byte-swap mask: converts each quad word between little-endian and big-endian
+;; a shuffle on a ZMM operates independently on each of the four 128-bit (XMM-sized) chunks
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
+ dq 0x2021222324252627, 0x28292a2b2c2d2e2f
+ dq 0x3031323334353637, 0x38393a3b3c3d3e3f
+
+align 64
+TRANSPOSE8_PERM_INDEX_1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+TRANSPOSE8_PERM_INDEX_2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha512_mb_x8_avx512
+no_sha512_mb_x8_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm
new file mode 100644
index 000000000..1113a1eea
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm
@@ -0,0 +1,252 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+;;;;;
+; mbin_dispatch_init_avoton parameters
+; Use this macro when an SSE (00/01) version is the minimum requirement;
+; when running on Avoton, avoton_func is selected instead of sse_func
+; 1-> function name
+; 2-> SSE/00/01 optimized function used as base
+; 3-> AVX or AVX/02 opt func
+; 4-> AVX2 or AVX/04 opt func
+; 5-> AVOTON opt func
+;;;;;
+%macro mbin_dispatch_init_avoton 5
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01
+
+ mov eax, 1
+ cpuid
+ lea mbin_rdi, [%5 WRT_OPT]
+ and eax, FLAG_CPUID1_EAX_STEP_MASK
+ cmp eax, FLAG_CPUID1_EAX_AVOTON
+ ; If Avoton, set Avoton symbol and exit
+ cmove mbin_rsi, mbin_rdi
+ je _%1_init_done
+
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func
+ jne _%1_init_done ; AVX is not available so end
+ mov mbin_rsi, mbin_rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _%1_init_done
+ lea mbin_rsi, [%2 WRT_OPT]
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init6_avoton parameters
+; when running on Avoton, avoton_func is selected instead of sse_func
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_1 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+; 7-> AVOTON opt func
+;;;;;
+%macro mbin_dispatch_init6_avoton 7
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ je _%1_init_done ; Use base function if no SSE4_1
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ lea mbin_rdi, [%7 WRT_OPT]
+ and eax, FLAG_CPUID1_EAX_STEP_MASK
+ cmp eax, FLAG_CPUID1_EAX_AVOTON
+ ; If Avoton, set Avoton symbol and exit
+ cmove mbin_rsi, mbin_rdi
+ je _%1_init_done
+
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+ mov edi, eax ; save xgetvb.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_init_done
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+ lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_EBX_AVX512_G1
+ lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+ cmove mbin_rsi, mbin_rbx
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
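+
+;; Runtime selection order: start from the base function, upgrade to the
+;; 00/01 (SSE4.1) version, take the Avoton path if the CPU stepping matches,
+;; otherwise keep upgrading to AVX, AVX2 and finally AVX512 as each
+;; CPUID/XGETBV check passes.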
+
+default rel
+[bits 64]
+
+%define def_wrd dq
+%define wrd_sz qword
+%define arg1 rsi
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern sha512_ctx_mgr_init_sse
+extern sha512_ctx_mgr_submit_sse
+extern sha512_ctx_mgr_flush_sse
+
+extern sha512_ctx_mgr_init_avx
+extern sha512_ctx_mgr_submit_avx
+extern sha512_ctx_mgr_flush_avx
+
+extern sha512_ctx_mgr_init_avx2
+extern sha512_ctx_mgr_submit_avx2
+extern sha512_ctx_mgr_flush_avx2
+
+extern sha512_ctx_mgr_init_base
+extern sha512_ctx_mgr_submit_base
+extern sha512_ctx_mgr_flush_base
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern sha512_ctx_mgr_init_avx512
+ extern sha512_ctx_mgr_submit_avx512
+ extern sha512_ctx_mgr_flush_avx512
+%endif
+
+extern sha512_ctx_mgr_init_sb_sse4
+extern sha512_ctx_mgr_submit_sb_sse4
+extern sha512_ctx_mgr_flush_sb_sse4
+
+;;; The *_mbinit values are the initial values of *_dispatched, which is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+; Initialise symbols
+mbin_interface sha512_ctx_mgr_init
+mbin_interface sha512_ctx_mgr_submit
+mbin_interface sha512_ctx_mgr_flush
+
+%ifdef HAVE_AS_KNOWS_AVX512
+	; Six-level dispatch (base, SSE, AVX, AVX2, AVX512), with the
+	; single-buffer SSE4 variant selected on Avoton
+ mbin_dispatch_init6_avoton sha512_ctx_mgr_init, sha512_ctx_mgr_init_base, \
+ sha512_ctx_mgr_init_sse, sha512_ctx_mgr_init_avx, \
+ sha512_ctx_mgr_init_avx2, sha512_ctx_mgr_init_avx512, \
+ sha512_ctx_mgr_init_sb_sse4
+
+ mbin_dispatch_init6_avoton sha512_ctx_mgr_submit, sha512_ctx_mgr_submit_base, \
+ sha512_ctx_mgr_submit_sse, sha512_ctx_mgr_submit_avx, \
+ sha512_ctx_mgr_submit_avx2, sha512_ctx_mgr_submit_avx512, \
+ sha512_ctx_mgr_submit_sb_sse4
+
+ mbin_dispatch_init6_avoton sha512_ctx_mgr_flush, sha512_ctx_mgr_flush_base, \
+ sha512_ctx_mgr_flush_sse, sha512_ctx_mgr_flush_avx, \
+ sha512_ctx_mgr_flush_avx2, sha512_ctx_mgr_flush_avx512, \
+ sha512_ctx_mgr_flush_sb_sse4
+%else
+ mbin_dispatch_init_avoton sha512_ctx_mgr_init, sha512_ctx_mgr_init_sse, \
+ sha512_ctx_mgr_init_avx, sha512_ctx_mgr_init_avx2, \
+ sha512_ctx_mgr_init_sb_sse4
+
+ mbin_dispatch_init_avoton sha512_ctx_mgr_submit, sha512_ctx_mgr_submit_sse, \
+ sha512_ctx_mgr_submit_avx, sha512_ctx_mgr_submit_avx2, \
+ sha512_ctx_mgr_submit_sb_sse4
+
+ mbin_dispatch_init_avoton sha512_ctx_mgr_flush, sha512_ctx_mgr_flush_sse, \
+ sha512_ctx_mgr_flush_avx, sha512_ctx_mgr_flush_avx2, \
+ sha512_ctx_mgr_flush_sb_sse4
+%endif
+
+
+;;; func core, ver, snum
+slversion sha512_ctx_mgr_init, 00, 04, 0175
+slversion sha512_ctx_mgr_submit, 00, 04, 0176
+slversion sha512_ctx_mgr_flush, 00, 04, 0177
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c
new file mode 100644
index 000000000..e9b156a33
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c
@@ -0,0 +1,234 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha512_mb.h"
+#include "endian_helper.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA512 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#define H0 0x6a09e667f3bcc908
+#define H1 0xbb67ae8584caa73b
+#define H2 0x3c6ef372fe94f82b
+#define H3 0xa54ff53a5f1d36f1
+#define H4 0x510e527fade682d1
+#define H5 0x9b05688c2b3e6c1f
+#define H6 0x1f83d9abfb41bd6b
+#define H7 0x5be0cd19137e2179
+
+void sha512_single(const uint8_t * data, uint64_t digest[]);
+
+void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA512_BLOCK_SIZE];
+
+ /* 128 bit lengths not needed as len is uint32_t, so use 64 bit length
+ * and pad the first 64 bits with zeros. */
+
+ digest[0] = H0;
+ digest[1] = H1;
+ digest[2] = H2;
+ digest[3] = H3;
+ digest[4] = H4;
+ digest[5] = H5;
+ digest[6] = H6;
+ digest[7] = H7;
+
+ i = len;
+ /* Hash the complete blocks */
+ while (i >= SHA512_BLOCK_SIZE) {
+ sha512_single(input_data, digest);
+ input_data += SHA512_BLOCK_SIZE;
+ i -= SHA512_BLOCK_SIZE;
+ }
+
+ /* Copy remainder to a buffer to be padded */
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+
+ // Pad more than required here and overwrite with length
+ for (j = i; j < (2 * SHA512_BLOCK_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA512_BLOCK_SIZE - SHA512_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA512_BLOCK_SIZE;
+ else
+ i = SHA512_BLOCK_SIZE;
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) len * 8);
+
+ /* Hash the padded last block */
+ sha512_single(buf, digest);
+	if (i == 2 * SHA512_BLOCK_SIZE)
+		sha512_single(buf + SHA512_BLOCK_SIZE, digest);
+}
+
+/* From the FIPS, these are the same as for SHA256, but operating on 64 bit words
+ * instead of 32 bit.
+ */
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+
+/* Sigma functions have same form as SHA256 but
+ * - change the word size to 64bit
+ * - change the amount to rotate
+ */
+#define ror64(x, r) (((x)>>(r)) ^ ((x)<<(64-(r))))
+
+/* In FIPS 180-4 notation the cases are swapped: s0/s1 below are the "capital sigma"
+ * functions and S0/S1 are the lower-case sigma functions, but the names are kept
+ * as-is to stay consistent with the other reference functions.
+ */
+#define s0(a) (ror64(a,28) ^ ror64(a,34) ^ ror64(a,39))
+#define s1(e) (ror64(e,14) ^ ror64(e,18) ^ ror64(e,41))
+
+#define S0(w) (ror64(w,1) ^ ror64(w,8) ^ (w >> 7))
+#define S1(w) (ror64(w,19) ^ ror64(w,61) ^ (w >> 6))
+
+#define W(x) w[(x) & 15]
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = to_be64(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
+
+void sha512_single(const uint8_t * data, uint64_t digest[])
+{
+ /* Check these are all uint64_t */
+ uint64_t a, b, c, d, e, f, g, h, t1, t2;
+ uint64_t w[16];
+ uint64_t *ww = (uint64_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98d728ae22);
+ step(1, h, a, b, c, d, e, f, g, 0x7137449123ef65cd);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcfec4d3b2f);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba58189dbbc);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25bf348b538);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1b605d019);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4af194f9b);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5da6d8118);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98a3030242);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b0145706fbe);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be4ee4b28c);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3d5ffb4e2);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74f27b896f);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe3b1696b1);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a725c71235);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174cf692694);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c19ef14ad2);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786384f25e3);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc68b8cd5b5);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc77ac9c65);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f592b0275);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa6ea6e483);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dcbd41fbd4);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da831153b5);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152ee66dfab);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d2db43210);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c898fb213f);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7beef0ee4);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf33da88fc2);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147930aa725);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351e003826f);
+ step(31, b, c, d, e, f, g, h, a, 0x142929670a0e6e70);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a8546d22ffc);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b21385c26c926);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc5ac42aed);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d139d95b3df);
+ step(36, e, f, g, h, a, b, c, d, 0x650a73548baf63de);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb3c77b2a8);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e47edaee6);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c851482353b);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a14cf10364);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664bbc423001);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70d0f89791);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a30654be30);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819d6ef5218);
+ step(45, d, e, f, g, h, a, b, c, 0xd69906245565a910);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e35855771202a);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa07032bbd1b8);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116b8d2d0c8);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c085141ab53);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774cdf8eeb99);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5e19b48a8);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3c5c95a63);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4ae3418acb);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f7763e373);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3d6b2b8a3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee5defb2fc);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f43172f60);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814a1f0ab72);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc702081a6439ec);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa23631e28);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506cebde82bde9);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7b2c67915);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2e372532b); // step 63
+ step(64, a, b, c, d, e, f, g, h, 0xca273eceea26619c);
+ step(65, h, a, b, c, d, e, f, g, 0xd186b8c721c0c207);
+ step(66, g, h, a, b, c, d, e, f, 0xeada7dd6cde0eb1e);
+ step(67, f, g, h, a, b, c, d, e, 0xf57d4f7fee6ed178);
+ step(68, e, f, g, h, a, b, c, d, 0x06f067aa72176fba);
+ step(69, d, e, f, g, h, a, b, c, 0x0a637dc5a2c898a6);
+ step(70, c, d, e, f, g, h, a, b, 0x113f9804bef90dae);
+ step(71, b, c, d, e, f, g, h, a, 0x1b710b35131c471b);
+ step(72, a, b, c, d, e, f, g, h, 0x28db77f523047d84);
+ step(73, h, a, b, c, d, e, f, g, 0x32caab7b40c72493);
+ step(74, g, h, a, b, c, d, e, f, 0x3c9ebe0a15c9bebc);
+ step(75, f, g, h, a, b, c, d, e, 0x431d67c49c100d4c);
+ step(76, e, f, g, h, a, b, c, d, 0x4cc5d4becb3e42b6);
+ step(77, d, e, f, g, h, a, b, c, 0x597f299cfc657e2a);
+ step(78, c, d, e, f, g, h, a, b, 0x5fcb6fab3ad6faec);
+ step(79, b, c, d, e, f, g, h, a, 0x6c44198c4a475817); // step 79
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
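
As a quick sanity check of the reference implementation above, sha512_ref() can be driven from
a small standalone program. The sketch below is illustrative only and not part of the patch; the
prototype is taken from sha512_ref.c itself, and the expected output for "abc" is the standard
FIPS 180 test vector, whose first digest word is 0xddaf35a193617aba.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative sketch only. Prototype of the reference routine in sha512_ref.c. */
    void sha512_ref(uint8_t *input_data, uint64_t *digest, uint32_t len);

    int main(void)
    {
            uint8_t msg[] = "abc";          /* FIPS 180 test vector input           */
            uint64_t digest[8];
            int i;

            sha512_ref(msg, digest, 3);     /* length in bytes; padding is internal */

            for (i = 0; i < 8; i++)         /* first word should be ddaf35a193617aba */
                    printf("%016llx", (unsigned long long)digest[i]);
            printf("\n");
            return 0;
    }
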
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c
new file mode 100644
index 000000000..6650b0106
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c
@@ -0,0 +1,46 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include "sha512_mb.h"
+
+/*
+ * Function: sha512_sb_mgr_flush_sse4.
+ *
+ * Description: This is a dummy implementation; nothing is done here.
+ *
+ * Return:      always NULL.
+ *
+ */
+SHA512_JOB *sha512_sb_mgr_flush_sse4(SHA512_MB_JOB_MGR * state)
+{
+ return NULL;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c
new file mode 100644
index 000000000..69df5600d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c
@@ -0,0 +1,38 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+// For the single-buffer implementation there is nothing to initialize.
+// This function exists only so that the single-buffer code conforms to
+// the multi-buffer API.
+void sha512_sb_mgr_init_sse4(SHA512_MB_JOB_MGR * state)
+{
+ return;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c
new file mode 100644
index 000000000..96e1a5ee4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c
@@ -0,0 +1,65 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include "sha512_mb.h"
+
+/*
+ * Function: sha512_sb_mgr_submit_sse4
+ *
+ * Description: Wrapper around the single-buffer SHA-512 update routine so
+ *              that it complies with the multi-buffer API.
+ *
+ *              It picks up the message, digest and length information from
+ *              the "job" argument and then calls sha512_sse4(). The "state"
+ *              argument is accepted but not used.
+ *
+ * Note:        Message initialization and padding are done by the caller;
+ *              this function expects a packed buffer.
+ *
+ * Argument:    state - unused.
+ *              job   - contains the message, digest, message length, etc.
+ *
+ * Return: SHA512_JOB pointer.
+ *
+ **/
+SHA512_JOB *sha512_sb_mgr_submit_sse4(SHA512_MB_JOB_MGR * state, SHA512_JOB * job)
+{
+ assert(job != NULL);
+
+ uint8_t *buff = job->buffer;
+ uint64_t *digest = job->result_digest, len = job->len;
+
+ sha512_sse4((const void *)buff, (void *)digest, len);
+
+ return job;
+}
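
To make the calling convention of these single-buffer wrappers concrete, a minimal sketch follows.
It assumes job->result_digest has already been loaded with the initial SHA-512 H0..H7 words and
that job->len counts whole 128-byte blocks of an already padded buffer, as the comment above
requires; the helper name is made up for illustration.

    #include <string.h>
    #include "sha512_mb.h"

    /* Illustrative sketch only: hash one pre-padded block (the padded empty
     * message) through the single-buffer SSE4 wrappers. */
    void sb_sse4_example(SHA512_JOB *job)
    {
            static uint8_t block[SHA512_BLOCK_SIZE];
            SHA512_MB_JOB_MGR mgr;

            sha512_sb_mgr_init_sse4(&mgr);        /* no-op for single buffer      */

            memset(block, 0, sizeof(block));
            block[0] = 0x80;                      /* padding of the empty message */

            job->buffer = block;
            job->len = 1;                         /* length in SHA512 blocks      */
            /* job->result_digest is assumed to already hold H0..H7              */

            sha512_sb_mgr_submit_sse4(&mgr, job); /* runs sha512_sse4()           */
            sha512_sb_mgr_flush_sse4(&mgr);       /* dummy, returns NULL          */
    }
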
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm
new file mode 100644
index 000000000..8b43bce5e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm
@@ -0,0 +1,396 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+; Virtual Registers
+%ifidn __OUTPUT_FORMAT__, win64
+ %define msg rcx ; ARG1
+ %define digest rdx ; ARG2
+ %define msglen r8 ; ARG3
+ %define T1 rsi
+ %define T2 rdi
+%else
+ %define msg rdi ; ARG1
+ %define digest rsi ; ARG2
+ %define msglen rdx ; ARG3
+ %define T1 rcx
+ %define T2 r8
+%endif
+%define a_64 r9
+%define b_64 r10
+%define c_64 r11
+%define d_64 r12
+%define e_64 r13
+%define f_64 r14
+%define g_64 r15
+%define h_64 rbx
+%define tmp0 rax
+
+; Local variables (stack frame)
+; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
+struc frame
+ .W: resq 80 ; Message Schedule
+ .WK: resq 2 ; W[t] + K[t] | W[t+1] + K[t+1]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ .GPRSAVE: resq 7
+%else
+ .GPRSAVE: resq 5
+%endif
+endstruc
+
+; Useful QWORD "arrays" for simpler memory references
+%define MSG(i) msg + 8*(i) ; Input message (arg1)
+%define DIGEST(i) digest + 8*(i) ; Output Digest (arg2)
+%define K_t(i) K512 + 8*(i) ; SHA Constants (static mem)
+%define W_t(i) rsp + frame.W + 8*(i) ; Message Schedule (stack frame)
+%define WK_2(i) rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame)
+; MSG, DIGEST, K_t, W_t are arrays
+; WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+%macro RotateState 0
+	; Rotate symbols a..h right
+ %xdefine %%TMP h_64
+ %xdefine h_64 g_64
+ %xdefine g_64 f_64
+ %xdefine f_64 e_64
+ %xdefine e_64 d_64
+ %xdefine d_64 c_64
+ %xdefine c_64 b_64
+ %xdefine b_64 a_64
+ %xdefine a_64 %%TMP
+%endmacro
+
+%macro SHA512_Round 1
+%assign %%t (%1)
+
+ ; Compute Round %%t
+ mov T1, f_64 ; T1 = f
+ mov tmp0, e_64 ; tmp = e
+ xor T1, g_64 ; T1 = f ^ g
+ ror tmp0, 23 ; 41 ; tmp = e ror 23
+ and T1, e_64 ; T1 = (f ^ g) & e
+ xor tmp0, e_64 ; tmp = (e ror 23) ^ e
+ xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ add T1, [WK_2(%%t)] ; W[t] + K[t] from message scheduler
+ ror tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4
+ xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov T2, a_64 ; T2 = a
+ add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h
+ ror tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add T1, tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov tmp0, a_64 ; tmp = a
+ xor T2, c_64 ; T2 = a ^ c
+ and tmp0, c_64 ; tmp = a & c
+ and T2, b_64 ; T2 = (a ^ c) & b
+ xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov tmp0, a_64 ; tmp = a
+ ror tmp0, 5 ; 39 ; tmp = a ror 5
+ xor tmp0, a_64 ; tmp = (a ror 5) ^ a
+ add d_64, T1 ; e(next_state) = d + T1
+ ror tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6
+ xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c)
+ ror tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add	h_64, tmp0	; a(next_state) = T1 + Maj(a,b,c) + S0(a)
+ RotateState
+%endmacro
+
+%macro SHA512_2Sched_2Round_sse 1
+%assign %%t (%1)
+
+ ; Compute rounds %%t-2 and %%t-1
+ ; Compute message schedule QWORDS %%t and %%t+1
+
+ ; Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ ; scheduler.
+ ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+ ; They are then added to their respective SHA512 constants at
+ ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+	; For brevity, the comments following vectored instructions only refer to
+	; the first of a pair of QWORDS.
+	; E.g. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+ ; The computation of the message schedule and the rounds are tightly
+ ; stitched to take advantage of instruction-level parallelism.
+ ; For clarity, integer instructions (for the rounds calculation) are indented
+ ; by one tab. Vectored instructions (for the message scheduler) are indented
+ ; by two tabs.
+
+ mov T1, f_64
+ movdqa xmm2, [W_t(%%t-2)] ; XMM2 = W[t-2]
+ xor T1, g_64
+ and T1, e_64
+ movdqa xmm0, xmm2 ; XMM0 = W[t-2]
+ xor T1, g_64
+ add T1, [WK_2(%%t)]
+ movdqu xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15]
+ mov tmp0, e_64
+ ror tmp0, 23 ; 41
+ movdqa xmm3, xmm5 ; XMM3 = W[t-15]
+ xor tmp0, e_64
+ ror tmp0, 4 ; 18
+ psrlq xmm0, 61 - 19 ; XMM0 = W[t-2] >> 42
+ xor tmp0, e_64
+ ror tmp0, 14 ; 14
+ psrlq xmm3, (8 - 7) ; XMM3 = W[t-15] >> 1
+ add T1, tmp0
+ add T1, h_64
+ pxor xmm0, xmm2 ; XMM0 = (W[t-2] >> 42) ^ W[t-2]
+ mov T2, a_64
+ xor T2, c_64
+ pxor xmm3, xmm5 ; XMM3 = (W[t-15] >> 1) ^ W[t-15]
+ and T2, b_64
+ mov tmp0, a_64
+ psrlq xmm0, 19 - 6 ; XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+ and tmp0, c_64
+ xor T2, tmp0
+ psrlq xmm3, (7 - 1) ; XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+ mov tmp0, a_64
+ ror tmp0, 5 ; 39
+ pxor xmm0, xmm2 ; XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+ xor tmp0, a_64
+ ror tmp0, 6 ; 34
+ pxor xmm3, xmm5 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+ xor tmp0, a_64
+ ror tmp0, 28 ; 28
+ psrlq xmm0, 6 ; XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+ add T2, tmp0
+ add d_64, T1
+ psrlq xmm3, 1 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+ lea h_64, [T1 + T2]
+ RotateState
+ movdqa xmm1, xmm2 ; XMM1 = W[t-2]
+ mov T1, f_64
+ xor T1, g_64
+ movdqa xmm4, xmm5 ; XMM4 = W[t-15]
+ and T1, e_64
+ xor T1, g_64
+ psllq xmm1, (64 - 19) - (64 - 61) ; XMM1 = W[t-2] << 42
+ add T1, [WK_2(%%t+1)]
+ mov tmp0, e_64
+ psllq xmm4, (64 - 1) - (64 - 8) ; XMM4 = W[t-15] << 7
+ ror tmp0, 23 ; 41
+ xor tmp0, e_64
+ pxor xmm1, xmm2 ; XMM1 = (W[t-2] << 42)^W[t-2]
+ ror tmp0, 4 ; 18
+ xor tmp0, e_64
+ pxor xmm4, xmm5 ; XMM4 = (W[t-15]<<7)^W[t-15]
+ ror tmp0, 14 ; 14
+ add T1, tmp0
+ psllq xmm1, (64 - 61) ; XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+ add T1, h_64
+ mov T2, a_64
+ psllq xmm4, (64 - 8) ; XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+ xor T2, c_64
+ and T2, b_64
+ pxor xmm0, xmm1 ; XMM0 = s1(W[t-2])
+ mov tmp0, a_64
+ and tmp0, c_64
+ movdqu xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7]
+ xor T2, tmp0
+ pxor xmm3, xmm4 ; XMM3 = s0(W[t-15])
+ mov tmp0, a_64
+ paddq xmm0, xmm3 ; XMM0 = s1(W[t-2]) + s0(W[t-15])
+ ror tmp0, 5 ; 39
+ paddq xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+ xor tmp0, a_64
+ paddq xmm0, xmm1 ; XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+ ror tmp0, 6 ; 34
+ movdqa [W_t(%%t)], xmm0 ; Store scheduled qwords
+ xor tmp0, a_64
+ paddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ ror tmp0, 28 ; 28
+ movdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] for next rounds
+ add T2, tmp0
+ add d_64, T1
+ lea h_64, [T1 + T2]
+ RotateState
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha512_sse4(const void* M, void* D, uint64_t L);
+; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+; The size of the message pointed to by M must be an integer number of SHA512
+; message blocks (a whole multiple of 128 bytes).
+; L is the message length in SHA512 blocks.
+mk_global sha512_sse4, function
+sha512_sse4:
+ endbranch
+ cmp msglen, 0
+ je .nowork
+
+ ; Allocate Stack Space
+ sub rsp, frame_size
+
+ ; Save GPRs
+ mov [rsp + frame.GPRSAVE + 8 * 0], rbx
+ mov [rsp + frame.GPRSAVE + 8 * 1], r12
+ mov [rsp + frame.GPRSAVE + 8 * 2], r13
+ mov [rsp + frame.GPRSAVE + 8 * 3], r14
+ mov [rsp + frame.GPRSAVE + 8 * 4], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + frame.GPRSAVE + 8 * 5], rsi
+ mov [rsp + frame.GPRSAVE + 8 * 6], rdi
+%endif
+
+.updateblock:
+
+ ; Load state variables
+ mov a_64, [DIGEST(0)]
+ mov b_64, [DIGEST(1)]
+ mov c_64, [DIGEST(2)]
+ mov d_64, [DIGEST(3)]
+ mov e_64, [DIGEST(4)]
+ mov f_64, [DIGEST(5)]
+ mov g_64, [DIGEST(6)]
+ mov h_64, [DIGEST(7)]
+
+ %assign t 0
+ %rep 80/2 + 1
+ ; (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ ; +1 iteration because the scheduler leads hashing by 1 iteration
+ %if t < 2
+ ; BSWAP 2 QWORDS
+ movdqa xmm1, [XMM_QWORD_BSWAP]
+ movdqu xmm0, [MSG(t)]
+ pshufb xmm0, xmm1 ; BSWAP
+ movdqa [W_t(t)], xmm0 ; Store Scheduled Pair
+ paddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ movdqa [WK_2(t)], xmm0 ; Store into WK for rounds
+ %elif t < 16
+ ; BSWAP 2 QWORDS; Compute 2 Rounds
+ movdqu xmm0, [MSG(t)]
+ pshufb xmm0, xmm1 ; BSWAP
+ SHA512_Round t - 2 ; Round t-2
+ movdqa [W_t(t)], xmm0 ; Store Scheduled Pair
+ paddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ SHA512_Round t - 1 ; Round t-1
+ movdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] into WK
+ %elif t < 79
+ ; Schedule 2 QWORDS; Compute 2 Rounds
+ SHA512_2Sched_2Round_sse t
+ %else
+ ; Compute 2 Rounds
+ SHA512_Round t - 2
+ SHA512_Round t - 1
+ %endif
+ %assign t t+2
+ %endrep
+
+ ; Update digest
+ add [DIGEST(0)], a_64
+ add [DIGEST(1)], b_64
+ add [DIGEST(2)], c_64
+ add [DIGEST(3)], d_64
+ add [DIGEST(4)], e_64
+ add [DIGEST(5)], f_64
+ add [DIGEST(6)], g_64
+ add [DIGEST(7)], h_64
+
+ ; Advance to next message block
+ add msg, 16*8
+ dec msglen
+ jnz .updateblock
+
+ ; Restore GPRs
+ mov rbx, [rsp + frame.GPRSAVE + 8 * 0]
+ mov r12, [rsp + frame.GPRSAVE + 8 * 1]
+ mov r13, [rsp + frame.GPRSAVE + 8 * 2]
+ mov r14, [rsp + frame.GPRSAVE + 8 * 3]
+ mov r15, [rsp + frame.GPRSAVE + 8 * 4]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rsi, [rsp + frame.GPRSAVE + 8 * 5]
+ mov rdi, [rsp + frame.GPRSAVE + 8 * 6]
+%endif
+ ; Restore Stack Pointer
+ add rsp, frame_size
+
+.nowork:
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Binary Data
+
+section .data
+
+ALIGN 16
+
+; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+; K[t] used in SHA512 hashing
+K512:
+ dq 0x428a2f98d728ae22,0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538,0x59f111f1b605d019
+ dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242,0x12835b0145706fbe
+ dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235,0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ dq 0x983e5152ee66dfab,0xa831c66d2db43210
+ dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ dq 0x06ca6351e003826f,0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ dq 0x650a73548baf63de,0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6,0x92722c851482353b
+ dq 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791,0xc76c51a30654be30
+ dq 0xd192e819d6ef5218,0xd69906245565a910
+ dq 0xf40e35855771202a,0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc,0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ dq 0x90befffa23631e28,0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ dq 0xca273eceea26619c,0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae,0x1b710b35131c471b
+ dq 0x28db77f523047d84,0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
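The "; 41", "; 18" and "; 14" annotations next to the ror instructions in SHA512_Round above record
which rotation of the original value each step corresponds to: S1(e) = ror14 ^ ror18 ^ ror41 is built
from three dependent rotates because 23+4+14 = 41 and 4+14 = 18 (S0(a) uses 5+6+28 = 39 and
6+28 = 34 the same way). A small, purely illustrative C check of that identity:

    #include <assert.h>
    #include <stdint.h>

    #define ROR64(x, r) (((x) >> (r)) | ((x) << (64 - (r))))

    /* Illustrative sketch only: S1(e) written directly and as the chained
     * form used by the assembly above. */
    static uint64_t s1_direct(uint64_t e)
    {
            return ROR64(e, 14) ^ ROR64(e, 18) ^ ROR64(e, 41);
    }

    static uint64_t s1_chained(uint64_t e)
    {
            return ROR64(ROR64(ROR64(e, 23) ^ e, 4) ^ e, 14);
    }

    int main(void)
    {
            assert(s1_direct(0x0123456789abcdefULL) == s1_chained(0x0123456789abcdefULL));
            return 0;
    }
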
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am
new file mode 100644
index 000000000..8f8a3f4a6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/Makefile.am
@@ -0,0 +1,121 @@
+########################################################################
+# Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_x86_64 += sm3_mb/sm3_ctx_base.c \
+ sm3_mb/sm3_multibinary.asm
+
+lsrc_base_aliases += sm3_mb/sm3_ctx_base.c \
+ sm3_mb/sm3_ctx_base_aliases.c
+
+lsrc_aarch64 += sm3_mb/sm3_ctx_base.c \
+ sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c \
+ sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S \
+ sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c \
+ sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c \
+ sm3_mb/aarch64/sm3_mb_sm_x1.S \
+ sm3_mb/aarch64/sm3_mb_sm_x2.S \
+ sm3_mb/aarch64/sm3_mb_sm_x3.S \
+ sm3_mb/aarch64/sm3_mb_sm_x4.S \
+ sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c \
+ sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c \
+ sm3_mb/aarch64/sm3_mb_asimd_x1.S \
+ sm3_mb/aarch64/sm3_mb_asimd_x4.S
+
+
+src_include += -I $(srcdir)/sm3_mb
+
+extern_hdrs += include/sm3_mb.h \
+ include/multi_buffer.h
+
+lsrc_x86_64 += sm3_mb/sm3_ctx_avx512.c \
+ sm3_mb/sm3_mb_mgr_submit_avx512.asm \
+ sm3_mb/sm3_mb_mgr_flush_avx512.asm \
+ sm3_mb/sm3_mb_x16_avx512.asm
+
+lsrc_x86_64 += sm3_mb/sm3_ctx_avx2.c \
+ sm3_mb/sm3_mb_mgr_submit_avx2.asm \
+ sm3_mb/sm3_mb_mgr_flush_avx2.asm \
+ sm3_mb/sm3_mb_x8_avx2.asm
+
+other_src += include/datastruct.asm \
+ include/multibinary.asm \
+ include/reg_sizes.asm \
+ include/memcpy_inline.h \
+ include/memcpy.asm \
+ include/intrinreg.h \
+ sm3_mb/sm3_job.asm \
+ sm3_mb/sm3_mb_mgr_datastruct.asm \
+ sm3_mb/sm3_test_helper.c
+
+check_tests += sm3_mb/sm3_ref_test
+
+unit_tests += sm3_mb/sm3_mb_rand_ssl_test \
+ sm3_mb/sm3_mb_rand_test \
+ sm3_mb/sm3_mb_rand_update_test \
+ sm3_mb/sm3_mb_flush_test \
+ sm3_mb/sm3_mb_test
+
+perf_tests += sm3_mb/sm3_mb_vs_ossl_perf \
+ sm3_mb/sm3_mb_vs_ossl_shortage_perf
+
+sm3_mb_rand_ssl_test: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sm3_mb_rand_ssl_test: sm3_test_helper.o
+sm3_mb_sm3_mb_rand_ssl_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
+
+sm3_mb_rand_update_test: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_rand_update_test_LDFLAGS = -lcrypto
+
+sm3_mb_rand_update_test: sm3_test_helper.o
+sm3_mb_sm3_mb_rand_update_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
+
+sm3_mb_flush_test: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_flush_test_LDFLAGS = -lcrypto
+
+sm3_mb_flush_test: sm3_test_helper.o
+sm3_mb_sm3_mb_flush_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
+
+sm3_mb_rand_test: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_rand_test_LDFLAGS = -lcrypto
+
+sm3_mb_rand_test: sm3_test_helper.o
+sm3_mb_sm3_mb_rand_test_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
+
+sm3_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
+sm3_mb_vs_ossl_perf: sm3_test_helper.o
+sm3_mb_sm3_mb_vs_ossl_perf_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
+
+sm3_mb_vs_ossl_shortage_perf: LDLIBS += -lcrypto
+sm3_mb_sm3_mb_vs_ossl_shortage_perf_LDFLAGS = -lcrypto
+
+sm3_mb_vs_ossl_shortage_perf: sm3_test_helper.o
+sm3_mb_sm3_mb_vs_ossl_shortage_perf_LDADD = sm3_mb/sm3_test_helper.lo libisal_crypto.la
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c
new file mode 100644
index 000000000..208a7414e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_aarch64_dispatcher.c
@@ -0,0 +1,65 @@
+/**********************************************************************
+ Copyright(c) 2019-2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <aarch64_multibinary.h>
+
+DEFINE_INTERFACE_DISPATCHER(sm3_ctx_mgr_submit)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SM3)
+ return PROVIDER_INFO(sm3_ctx_mgr_submit_sm);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(sm3_ctx_mgr_submit_asimd);
+
+ return PROVIDER_BASIC(sm3_ctx_mgr_submit);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sm3_ctx_mgr_init)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SM3)
+ return PROVIDER_INFO(sm3_ctx_mgr_init_sm);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(sm3_ctx_mgr_init_asimd);
+
+ return PROVIDER_BASIC(sm3_ctx_mgr_init);
+
+}
+
+DEFINE_INTERFACE_DISPATCHER(sm3_ctx_mgr_flush)
+{
+ unsigned long auxval = getauxval(AT_HWCAP);
+ if (auxval & HWCAP_SM3)
+ return PROVIDER_INFO(sm3_ctx_mgr_flush_sm);
+ if (auxval & HWCAP_ASIMD)
+ return PROVIDER_INFO(sm3_ctx_mgr_flush_asimd);
+
+ return PROVIDER_BASIC(sm3_ctx_mgr_flush);
+
+}
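
The dispatchers above all apply the same rule: prefer the SM3 instruction set extension, fall back
to plain ASIMD, and otherwise use the portable base code. The aarch64-Linux sketch below restates
that selection logic standalone; the helper name and the returned strings are made up for
illustration, while getauxval()/AT_HWCAP and the HWCAP_SM3/HWCAP_ASIMD bits are taken from the
code above.

    #include <stdio.h>
    #include <sys/auxv.h>    /* getauxval, AT_HWCAP            */
    #include <asm/hwcap.h>   /* HWCAP_SM3, HWCAP_ASIMD (Linux) */

    /* Illustrative sketch only: pick the best SM3 multi-buffer provider at run time. */
    static const char *sm3_mb_pick_provider(void)
    {
            unsigned long hwcap = getauxval(AT_HWCAP);

            if (hwcap & HWCAP_SM3)
                    return "sm3_ctx_mgr_*_sm";     /* SM3 crypto extension */
            if (hwcap & HWCAP_ASIMD)
                    return "sm3_ctx_mgr_*_asimd";  /* plain NEON           */
            return "sm3_ctx_mgr_*_base";           /* portable C           */
    }

    int main(void)
    {
            printf("provider: %s\n", sm3_mb_pick_provider());
            return 0;
    }
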
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S
new file mode 100644
index 000000000..c7362de90
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x1.S
@@ -0,0 +1,387 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+ job .req x0
+ len .req x1
+ data .req x2
+ digest .req x0
+
+ msg0 .req w3
+ msg1 .req w4
+ msg2 .req w5
+ msg3 .req w6
+ msg4 .req w7
+
+ msg .req w9
+ msgP .req w10
+ SS1 .req w11
+ SS2 .req w12
+ TT1 .req w13
+ TT2 .req w14
+ Tj .req w15
+ tmp0 .req w19
+ tmp1 .req w20
+ dig_A .req w21
+ dig_B .req w22
+ dig_C .req w23
+ dig_D .req w24
+ dig_E .req w25
+ dig_F .req w26
+ dig_G .req w27
+ dig_H .req w28
+
+ declare_var_vector_reg dig0,0
+ declare_var_vector_reg dig1,1
+ declare_var_vector_reg dig0_bak,2
+ declare_var_vector_reg dig1_bak,3
+ declare_var_vector_reg vect_msg0,4
+ declare_var_vector_reg vect_msg1,5
+ declare_var_vector_reg vect_msg2,6
+ declare_var_vector_reg vect_msg3,7
+
+ declare_var_vector_reg vect_msgP0,16
+ declare_var_vector_reg vect_msgP1,17
+ declare_var_vector_reg vect_msgP2,18
+
+
+
+
+
+
+// round 0-11
+.macro sm3_round_0 round:req
+ ldr msg, [sp,msg_off+4*\round\()]
+ ldr msgP,[sp,wp_off +4*\round\()]
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+ eor TT1,dig_A,dig_B
+ eor TT2,dig_E,dig_F
+ add SS2,SS2,msgP
+ eor TT2,TT2,dig_G
+ add SS1,SS1,msg
+ eor TT1,TT1,dig_C
+ add SS2,SS2,dig_D
+ add SS1,SS1,dig_H
+ add TT1,TT1,SS2
+ add TT2,TT2,SS1
+ mov dig_D,dig_C
+ ror dig_C,dig_B,32-9
+ mov dig_B,dig_A
+ mov dig_A,TT1
+ eor TT1,TT2,TT2,ror (32-17)
+ mov dig_H,dig_G
+ ror dig_G,dig_F,32-19
+ mov dig_F,dig_E
+ eor dig_E,TT1,TT2,ror(32-9)
+ ror Tj,Tj,(32-1)
+.endm
+
+//round 12-15
+.macro sm3_round_12 round:req
+ ldr msg, [sp,msg_off+4*((\round\())%17)]
+ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)]
+ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)]
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+
+ eor msg0,msg0,msg1
+ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)]
+ eor TT1,dig_A,dig_B
+ eor TT2,dig_E,dig_F
+ add SS2,SS2,dig_D
+ eor TT2,TT2,dig_G
+ add SS1,SS1,msg
+ eor msg0,msg0,msg2,ror (32-15)
+ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)]
+ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)]
+ eor msg1,msg0,msg0,ror (32 -15)
+ eor TT1,TT1,dig_C
+ add TT1,TT1,SS2
+ eor msg4,msg4,msg3, ror (32-7)
+ eor msg0,msg1,msg0, ror (32-23)
+ add SS1,SS1,dig_H
+ eor msg0,msg0,msg4
+ add TT2,TT2,SS1
+ mov dig_D,dig_C
+ str msg0,[sp,msg_off+4*((\round\()+4)%17)]
+ eor msgP,msg,msg0
+ add TT1,TT1,msgP
+ ror dig_C,dig_B,32-9
+ mov dig_B,dig_A
+ mov dig_A,TT1
+ eor TT1,TT2,TT2,ror (32-17)
+ mov dig_H,dig_G
+ ror dig_G,dig_F,32-19
+ mov dig_F,dig_E
+ eor dig_E,TT1,TT2,ror(32-9)
+ ror Tj,Tj,32-1
+.endm
+
+// round 16-62
+.macro sm3_round_16 round:req
+ ldr msg, [sp,msg_off+4*((\round\())%17)]
+ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)]
+ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)]
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+
+ eor msg0,msg0,msg1
+ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)]
+ orr TT1,dig_B,dig_C
+ and tmp0,dig_B,dig_C
+
+ eor TT2,dig_F,dig_G
+ and TT1,TT1,dig_A
+ add SS2,SS2,dig_D
+ orr TT1,TT1,tmp0
+ and TT2,TT2,dig_E
+ add SS1,SS1,msg
+ eor TT2,TT2,dig_G
+
+ eor msg0,msg0,msg2,ror (32-15)
+ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)]
+ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)]
+ eor msg1,msg0,msg0,ror (32 -15)
+ add TT1,TT1,SS2
+ eor msg4,msg4,msg3, ror (32-7)
+ eor msg0,msg1,msg0, ror (32-23)
+ add SS1,SS1,dig_H
+ eor msg0,msg0,msg4
+ add TT2,TT2,SS1
+ mov dig_D,dig_C
+ str msg0,[sp,msg_off+4*((\round\()+4)%17)]
+ eor msgP,msg,msg0
+ add TT1,TT1,msgP
+ ror dig_C,dig_B,32-9
+ mov dig_B,dig_A
+ mov dig_A,TT1
+ eor TT1,TT2,TT2,ror (32-17)
+ mov dig_H,dig_G
+ ror dig_G,dig_F,32-19
+ mov dig_F,dig_E
+ eor dig_E,TT1,TT2,ror(32-9)
+ ror Tj,Tj,32-1
+.endm
+
+//round 63
+.macro sm3_round_63 round:req
+ ldr msg, [sp,msg_off+4*((\round\())%17)]
+ ldr msg0,[sp,msg_off+4*((\round\()+4 - 16)%17)]
+ ldr msg1,[sp,msg_off+4*((\round\()+4 - 9)%17)]
+ add SS1,dig_E,Tj
+ ror TT1,dig_A,32-12
+ add SS1,SS1,TT1
+ ror SS1,SS1,32-7 //SS1 done
+ eor SS2,SS1,TT1 //SS2 done
+ eor msg0,msg0,msg1
+ ldr msg2,[sp,msg_off+4*((\round\()+4 - 3)%17)]
+ orr TT1,dig_B,dig_C
+ and tmp0,dig_B,dig_C
+ eor TT2,dig_F,dig_G
+ and TT1,TT1,dig_A
+ add SS2,SS2,dig_D
+ orr TT1,TT1,tmp0
+ and TT2,TT2,dig_E
+ add SS1,SS1,msg
+ eor TT2,TT2,dig_G
+ eor msg0,msg0,msg2,ror (32-15)
+ ldr msg3,[sp,msg_off+4*((\round\()+4 - 13)%17)]
+ ldr msg4,[sp,msg_off+4*((\round\()+4 - 6)%17)]
+ eor msg1,msg0,msg0,ror (32 -15)
+ add TT1,TT1,SS2
+ eor msg4,msg4,msg3, ror (32-7)
+ eor msg0,msg1,msg0, ror (32-23)
+ add SS1,SS1,dig_H
+ eor msg0,msg0,msg4
+ add TT2,TT2,SS1
+ str msg0,[sp,msg_off+4*((\round\()+4)%17)]
+ eor msgP,msg,msg0
+ add TT1,TT1,msgP
+ ins vdig0_bak.s[3],dig_C
+ ror dig_C,dig_B,32-9
+ ins vdig0_bak.s[1],dig_A
+ ins vdig0_bak.s[0],TT1
+ ins vdig0_bak.s[2],dig_C
+ eor TT1,TT2,TT2,ror (32-17)
+ ins vdig1_bak.s[3],dig_G
+ ror dig_G,dig_F,32-19
+ ins vdig1_bak.s[1],dig_E
+ ins vdig1_bak.s[2],dig_G
+ eor dig_E,TT1,TT2,ror(32-9)
+ ins vdig1_bak.s[0],dig_E
+.endm
+
+ .set wp_off , 96
+ .set msg_off, 96 + 12*4
+#define STACK_SIZE 224
+ .global sm3_mb_asimd_x1
+ .type sm3_mb_asimd_x1, %function
+sm3_mb_asimd_x1:
+ stp x29,x30, [sp,-STACK_SIZE]!
+ cmp len,0
+ ldr data,[job],64
+ ldp qdig0,qdig1,[digest]
+ stp x19, x20, [sp, 16]
+ stp x21, x22, [sp, 32]
+ rev32 vdig0.16b,vdig0.16b
+ stp x23, x24, [sp, 48]
+ rev32 vdig1.16b,vdig1.16b
+ stp x25, x26, [sp, 64]
+ stp x27, x28, [sp, 80]
+ ble .exit_func
+
+.start_loop:
+
+ /** prepare first 12 round data **/
+ ld1 {vvect_msg0.16b-vvect_msg3.16b},[data],64
+ mov Tj, 17689
+ umov dig_A,vdig0.s[0]
+ movk Tj, 0x79cc, lsl 16
+ rev32 vvect_msg0.16b,vvect_msg0.16b
+ umov dig_B,vdig0.s[1]
+ rev32 vvect_msg1.16b,vvect_msg1.16b
+ umov dig_C,vdig0.s[2]
+ rev32 vvect_msg2.16b,vvect_msg2.16b
+ umov dig_D,vdig0.s[3]
+ rev32 vvect_msg3.16b,vvect_msg3.16b
+ umov dig_E,vdig1.s[0]
+ stp qvect_msg0,qvect_msg1,[sp,msg_off]
+ umov dig_F,vdig1.s[1]
+ stp qvect_msg2,qvect_msg3,[sp,msg_off+32]
+ umov dig_G,vdig1.s[2]
+ eor vvect_msgP0.16b,vvect_msg0.16b,vvect_msg1.16b
+ eor vvect_msgP1.16b,vvect_msg1.16b,vvect_msg2.16b
+ umov dig_H,vdig1.s[3]
+ stp qvect_msgP0,qvect_msgP1,[sp,wp_off]
+ eor vvect_msgP2.16b,vvect_msg2.16b,vvect_msg3.16b
+ str qvect_msgP2,[sp,wp_off+32]
+
+ sm3_round_0 0
+ sm3_round_0 1
+ sm3_round_0 2
+ sm3_round_0 3
+ sm3_round_0 4
+ sm3_round_0 5
+ sm3_round_0 6
+ sm3_round_0 7
+ sm3_round_0 8
+ sm3_round_0 9
+ sm3_round_0 10
+ sm3_round_0 11
+
+ sm3_round_12 12
+ sm3_round_12 13
+ sm3_round_12 14
+ sm3_round_12 15
+ mov Tj, 0x7a87
+ movk Tj, 0x9d8a, lsl 16
+ sm3_round_16 16
+ sm3_round_16 17
+ sm3_round_16 18
+ sm3_round_16 19
+ sm3_round_16 20
+ sm3_round_16 21
+ sm3_round_16 22
+ sm3_round_16 23
+ sm3_round_16 24
+ sm3_round_16 25
+ sm3_round_16 26
+ sm3_round_16 27
+ sm3_round_16 28
+ sm3_round_16 29
+ sm3_round_16 30
+ sm3_round_16 31
+ sm3_round_16 32
+ sm3_round_16 33
+ sm3_round_16 34
+ sm3_round_16 35
+ sm3_round_16 36
+ sm3_round_16 37
+ sm3_round_16 38
+ sm3_round_16 39
+ sm3_round_16 40
+ sm3_round_16 41
+ sm3_round_16 42
+ sm3_round_16 43
+ sm3_round_16 44
+ sm3_round_16 45
+ sm3_round_16 46
+ sm3_round_16 47
+ sm3_round_16 48
+ sm3_round_16 49
+ sm3_round_16 50
+ sm3_round_16 51
+ sm3_round_16 52
+ sm3_round_16 53
+ sm3_round_16 54
+ sm3_round_16 55
+ sm3_round_16 56
+ sm3_round_16 57
+ sm3_round_16 58
+ sm3_round_16 59
+ sm3_round_16 60
+ sm3_round_16 61
+ sm3_round_16 62
+ sm3_round_63 63
+ subs len,len,1
+ eor vdig0.16b,vdig0.16b,vdig0_bak.16b
+ eor vdig1.16b,vdig1.16b,vdig1_bak.16b
+ bne .start_loop
+.exit_func:
+ ldp x19, x20, [sp, 16]
+ rev32 vdig0.16b,vdig0.16b
+ ldp x21, x22, [sp, 32]
+ rev32 vdig1.16b,vdig1.16b
+ ldp x23, x24, [sp, 48]
+ stp qdig0,qdig1,[digest]
+ ldp x25, x26, [sp, 64]
+ ldp x27, x28, [sp, 80]
+ ldp x29, x30, [sp], STACK_SIZE
+ ret
+ .size sm3_mb_asimd_x1, .-sm3_mb_asimd_x1
+
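For readers following the round macros above, the SS1/SS2/TT1/TT2 data flow they implement is the
standard SM3 compression round. The scalar C rendering of one round below (j >= 16 form, matching
sm3_round_16) is a reading aid only, not part of the patch; it mirrors the in-line comments such
as "//C = ROTL32(B, 9)" and "// E = P0(TT2)", and the function name is made up for illustration.

    #include <stdint.h>

    #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

    /* Illustrative sketch only: one SM3 round for j >= 16.  dig[] holds A..H,
     * w is W[j], wp is the "word pair" W'[j] = W[j] ^ W[j+4], and tj is the
     * round constant already rotated for round j. */
    static void sm3_round_c(uint32_t dig[8], uint32_t w, uint32_t wp, uint32_t tj)
    {
            uint32_t A = dig[0], B = dig[1], C = dig[2], D = dig[3];
            uint32_t E = dig[4], F = dig[5], G = dig[6], H = dig[7];
            uint32_t SS1, SS2, TT1, TT2;

            SS1 = ROTL32(ROTL32(A, 12) + E + tj, 7);
            SS2 = SS1 ^ ROTL32(A, 12);
            TT1 = ((A & B) | (A & C) | (B & C)) + D + SS2 + wp;  /* FF for j >= 16 */
            TT2 = ((E & F) | (~E & G)) + H + SS1 + w;            /* GG for j >= 16 */

            dig[3] = C;                                          /* D = C            */
            dig[2] = ROTL32(B, 9);                               /* C = ROTL32(B, 9) */
            dig[1] = A;                                          /* B = A            */
            dig[0] = TT1;                                        /* A = TT1          */
            dig[7] = G;                                          /* H = G            */
            dig[6] = ROTL32(F, 19);                              /* G = ROTL32(F,19) */
            dig[5] = E;                                          /* F = E            */
            dig[4] = TT2 ^ ROTL32(TT2, 9) ^ ROTL32(TT2, 17);     /* E = P0(TT2)      */
    }
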
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S
new file mode 100644
index 000000000..975a07c7a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_asimd_x4.S
@@ -0,0 +1,576 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ job3 .req x3
+ len .req x4
+
+ job0_data .req x5
+ job1_data .req x6
+ job2_data .req x7
+ job3_data .req x9
+
+ job0_digest .req x0
+ job1_digest .req x1
+ job2_digest .req x2
+ job3_digest .req x3
+ job0_tmp .req x10
+ job1_tmp .req x11
+ job2_tmp .req x12
+ job3_tmp .req x13
+ const_adr .req x14
+
+
+ declare_var_vector_reg msg0,0
+ declare_var_vector_reg msg1,1
+ declare_var_vector_reg msg2,2
+ declare_var_vector_reg msg3,3
+ declare_var_vector_reg msg4,4
+ declare_var_vector_reg msg5,5
+ declare_var_vector_reg msg6,6
+ declare_var_vector_reg msg7,7
+ declare_var_vector_reg msg8,8
+ declare_var_vector_reg msg9,9
+ declare_var_vector_reg msg10,10
+ declare_var_vector_reg msg11,11
+ declare_var_vector_reg msg12,12
+ declare_var_vector_reg msg13,13
+ declare_var_vector_reg msg14,14
+ declare_var_vector_reg msg15,15
+ declare_var_vector_reg msg16,16
+
+
+ declare_var_vector_reg dig_A,24
+ declare_var_vector_reg dig_B,25
+ declare_var_vector_reg dig_C,26
+ declare_var_vector_reg dig_D,27
+ declare_var_vector_reg dig_E,28
+ declare_var_vector_reg dig_F,29
+ declare_var_vector_reg dig_G,30
+ declare_var_vector_reg dig_H,31
+
+ declare_var_vector_reg TT1,17
+ declare_var_vector_reg TT2,18
+ declare_var_vector_reg SS1,19
+ declare_var_vector_reg SS2,20
+ declare_var_vector_reg tmp0,21
+ declare_var_vector_reg word_pair,23
+ declare_var_vector_reg Tj,22
+
+
+.macro rol32 target:req,reg:req,bit:req
+ ushr v\target\().4s,v\reg\().4s,32 - \bit
+ sli v\target\().4s,v\reg\().4s,\bit
+.endm
+
+// round 0-11
+.macro sm3_round_0 round:req,wp:req
+
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ sli vtmp0.4s,vdig_A.4s,12
+ rev32 vmsg\round\().16b,vmsg\round\().16b
+ rev32 vmsg\wp\().16b,vmsg\wp\().16b
+ add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,TT1,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b
+
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
+
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+
+.endm
+
+
+.macro sm3_round_4 round:req,wp:req
+
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ sli vtmp0.4s,vdig_A.4s,12
+ rev32 vmsg\wp\().16b,vmsg\wp\().16b
+ add vTT1.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,TT1,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\wp\().16b
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+
+.endm
+
+//round 12-15
+.macro sm3_round_12 round:req,plus_4:req,m0,m1,m2,m3,m4
+ rol32 msg\plus_4,msg\m2,15
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
+ rol32 tmp0,msg\plus_4,15
+ rol32 word_pair,msg\plus_4,23
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
+ rol32 tmp0,msg\m3,7
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ sli vtmp0.4s,vdig_A.4s,12
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,SS2,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
+ eor vTT1.16b,vdig_A.16b,vdig_B.16b
+ eor vTT1.16b,vTT1.16b,vdig_C.16b
+ eor vTT2.16b,vdig_E.16b,vdig_F.16b
+ eor vTT2.16b,vTT2.16b,vdig_G.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+.endm
+
+// round 16-62
+.macro sm3_round_16 round:req,plus_4:req,m0,m1,m2,m3,m4
+ rol32 msg\plus_4,msg\m2,15
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
+ rol32 tmp0,msg\plus_4,15
+ rol32 word_pair,msg\plus_4,23
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
+ rol32 tmp0,msg\m3,7
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ sli vtmp0.4s,vdig_A.4s,12
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,SS2,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
+ mov vTT2.16b,vdig_E.16b
+ orr vTT1.16b,vdig_B.16b,vdig_C.16b
+ and vtmp0.16b,vdig_B.16b,vdig_C.16b
+ bsl vTT2.16b,vdig_F.16b,vdig_G.16b
+ and vTT1.16b,vTT1.16b,vdig_A.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ orr vTT1.16b,vTT1.16b,vtmp0.16b
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ ushr vtmp0.4s,vTj.4s,32-1
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ sli vtmp0.4s,vTj.4s,1
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ mov vTj.16b,vtmp0.16b
+ //D=C
+ mov vdig_D.16b,vdig_C.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ //B=A
+ mov vdig_B.16b,vdig_A.16b
+ //A=TT1
+ mov vdig_A.16b,vTT1.16b
+ // H=G
+ mov vdig_H.16b,vdig_G.16b
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ //F = E
+ mov vdig_F.16b,vdig_E.16b
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+.endm
+
+//round 63
+.macro sm3_round_63 round:req,plus_4:req,m0,m1,m2,m3,m4
+ rol32 msg\plus_4,msg\m2,15
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m0\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m1\().16b
+ rol32 tmp0,msg\plus_4,15
+ rol32 word_pair,msg\plus_4,23
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vword_pair.16b
+ rol32 tmp0,msg\m3,7
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vmsg\m4\().16b
+ eor vmsg\plus_4\().16b,vmsg\plus_4\().16b,vtmp0.16b
+ ushr vtmp0.4s,vdig_A.4s,32 - 12
+ sli vtmp0.4s,vdig_A.4s,12
+ add vSS1.4s,vdig_E.4s,vTj.4s
+ add vSS2.4s,vSS1.4s,vtmp0.4s //SS1 Done
+ rol32 SS1,SS2,7
+ eor vSS2.16b,vSS1.16b,vtmp0.16b //SS2 Done
+ eor vword_pair.16b,vmsg\round\().16b,vmsg\plus_4\().16b
+
+ ldp qmsg0,qmsg1,[sp,dig_off+ 0]
+ mov vTT2.16b,vdig_E.16b
+ ldp qmsg2,qmsg3,[sp,dig_off+ 32]
+ orr vTT1.16b,vdig_B.16b,vdig_C.16b
+ ldp qmsg4,qmsg5,[sp,dig_off+ 64]
+ and vtmp0.16b,vdig_B.16b,vdig_C.16b
+ bsl vTT2.16b,vdig_F.16b,vdig_G.16b
+ ldp qmsg6,qmsg7,[sp,dig_off+ 96]
+ and vTT1.16b,vTT1.16b,vdig_A.16b
+ add vSS1.4s,vSS1.4s,vmsg\round\().4s
+ orr vTT1.16b,vTT1.16b,vtmp0.16b
+ add vSS2.4s,vSS2.4s,vword_pair.4s
+ add vTT1.4s,vTT1.4s,vdig_D.4s
+ add vTT2.4s,vTT2.4s,vdig_H.4s
+ add vTT1.4s,vTT1.4s,vSS2.4s //TT1 Done
+ add vTT2.4s,vTT2.4s,vSS1.4s //TT2 Done
+ //D=C
+ eor vdig_D.16b,vdig_C.16b,vmsg3.16b
+ //C = ROTL32(B, 9);
+ ushr vdig_C.4s,vdig_B.4s,32 - 9
+ sli vdig_C.4s,vdig_B.4s,9
+ eor vdig_C.16b,vdig_C.16b,vmsg2.16b
+ //B=A
+ eor vdig_B.16b,vdig_A.16b,vmsg1.16b
+ stp qdig_C,qdig_D,[sp,dig_off+ 32]
+ //A=TT1
+ eor vdig_A.16b,vTT1.16b,vmsg0.16b
+ // H=G
+ eor vdig_H.16b,vdig_G.16b,vmsg7.16b
+ stp qdig_A,qdig_B,[sp,dig_off+ 0]
+ //G = ROTL32(F,19)
+ rol32 dig_G,dig_F,19
+ eor vdig_G.16b,vdig_G.16b,vmsg6.16b
+ //F = E
+ eor vdig_F.16b,vdig_E.16b,vmsg5.16b
+ stp qdig_G,qdig_H,[sp,dig_off+ 96]
+ // E=Target, TT2=src, TT1,SS1,SS2 is free
+ // E = P0(TT2);
+ ushr vSS2.4s, vTT2.4s, 32 - 9
+ ushr vSS1.4s, vTT2.4s, 32 - 17
+ sli vSS2.4s, vTT2.4s, 9
+ sli vSS1.4s, vTT2.4s, 17
+ eor vdig_E.16b, vTT2.16b, vSS1.16b
+ eor vdig_E.16b, vdig_E.16b, vSS2.16b
+ eor vdig_E.16b, vdig_E.16b, vmsg4.16b
+ stp qdig_E,qdig_F,[sp,dig_off+ 64]
+.endm
+
+ .set dig_off , 80
+
+#define STACK_SIZE 224
+ .global sm3_mb_asimd_x4
+ .type sm3_mb_asimd_x4, %function
+sm3_mb_asimd_x4:
+ stp x29,x30, [sp,-STACK_SIZE]!
+ cmp len,0
+ //push d8~d15
+ ldr job0_data, [job0],64
+ stp d8,d9, [sp,16]
+ ldr job1_data, [job1],64
+ stp d10,d11,[sp,32]
+ ldr job2_data, [job2],64
+ stp d12,d13,[sp,48]
+ ldr job3_data, [job3],64
+ stp d14,d15,[sp,64]
+ ble .exit_func
+
+ mov job0_tmp,job0_digest
+ mov job1_tmp,job1_digest
+ mov job2_tmp,job2_digest
+ mov job3_tmp,job3_digest
+ //load digests
+ ld4 {vdig_A.s-vdig_D.s}[0],[job0_tmp],16
+ ld4 {vdig_A.s-vdig_D.s}[1],[job1_tmp],16
+ ld4 {vdig_A.s-vdig_D.s}[2],[job2_tmp],16
+ adrp const_adr, .consts
+ ld4 {vdig_A.s-vdig_D.s}[3],[job3_tmp],16
+ add const_adr, const_adr, #:lo12:.consts
+ ld4 {vdig_E.s-vdig_H.s}[0],[job0_tmp]
+ rev32 vdig_A.16b,vdig_A.16b
+ ld4 {vdig_E.s-vdig_H.s}[1],[job1_tmp]
+ rev32 vdig_B.16b,vdig_B.16b
+ ld4 {vdig_E.s-vdig_H.s}[2],[job2_tmp]
+ rev32 vdig_C.16b,vdig_C.16b
+ ld4 {vdig_E.s-vdig_H.s}[3],[job3_tmp]
+ rev32 vdig_D.16b,vdig_D.16b
+ stp qdig_A,qdig_B,[sp,dig_off+ 0]
+ rev32 vdig_E.16b,vdig_E.16b
+ rev32 vdig_F.16b,vdig_F.16b
+ stp qdig_C,qdig_D,[sp,dig_off+ 32]
+ rev32 vdig_G.16b,vdig_G.16b
+ rev32 vdig_H.16b,vdig_H.16b
+ stp qdig_E,qdig_F,[sp,dig_off+ 64]
+ stp qdig_G,qdig_H,[sp,dig_off+ 96]
+
+.start_loop:
+ ld4 {vmsg0.s-vmsg3.s}[0],[job0_data],16
+ ld4 {vmsg0.s-vmsg3.s}[1],[job1_data],16
+ ld4 {vmsg0.s-vmsg3.s}[2],[job2_data],16
+ ld4 {vmsg0.s-vmsg3.s}[3],[job3_data],16
+ ld4 {vmsg4.s-vmsg7.s}[0],[job0_data],16
+ ld4 {vmsg4.s-vmsg7.s}[1],[job1_data],16
+ ld4 {vmsg4.s-vmsg7.s}[2],[job2_data],16
+ ld4 {vmsg4.s-vmsg7.s}[3],[job3_data],16
+	ld4	{vmsg8.s-vmsg11.s}[0],[job0_data],16
+ ldr qTj,[const_adr]
+
+ sm3_round_0 0, 4
+
+ ld4 {vmsg8.s-vmsg11.s}[1],[job1_data],16
+ sm3_round_0 1, 5
+
+ ld4 {vmsg8.s-vmsg11.s}[2],[job2_data],16
+ sm3_round_0 2, 6
+ ld4 {vmsg8.s-vmsg11.s}[3],[job3_data],16
+ sm3_round_0 3, 7
+
+ ld4 {vmsg12.s-vmsg15.s}[0],[job0_data],16
+
+ sm3_round_4 4, 8
+ ld4 {vmsg12.s-vmsg15.s}[1],[job1_data],16
+ sm3_round_4 5, 9
+ ld4 {vmsg12.s-vmsg15.s}[2],[job2_data],16
+ sm3_round_4 6,10
+ ld4 {vmsg12.s-vmsg15.s}[3],[job3_data],16
+ sm3_round_4 7,11
+ sm3_round_4 8,12
+ sm3_round_4 9,13
+ sm3_round_4 10,14
+ sm3_round_4 11,15
+
+ sm3_round_12 12,16, 0, 7,13, 3,10 //12
+ sm3_round_12 13, 0, 1, 8,14, 4,11 //13
+ sm3_round_12 14, 1, 2, 9,15, 5,12 //14
+ sm3_round_12 15, 2, 3,10,16, 6,13 //15
+
+ ldr qTj,[const_adr,16]
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //16
+#if 0
+ stp sdig_A,sdig_B,[job0_digest]
+ stp sdig_C,sdig_D,[job0_digest,8]
+ stp sdig_E,sdig_F,[job0_digest,16]
+ stp sdig_G,sdig_H,[job0_digest,24]
+ b .exit_func
+#endif
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //17
+
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //18
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //19
+ sm3_round_16 3, 7, 8,15, 4,11, 1 //20
+ sm3_round_16 4, 8, 9,16, 5,12, 2 //21
+ sm3_round_16 5, 9,10, 0, 6,13, 3 //22
+ sm3_round_16 6,10,11, 1, 7,14, 4 //23
+ sm3_round_16 7,11,12, 2, 8,15, 5 //24
+ sm3_round_16 8,12,13, 3, 9,16, 6 //25
+ sm3_round_16 9,13,14, 4,10, 0, 7 //26
+ sm3_round_16 10,14,15, 5,11, 1, 8 //27
+ sm3_round_16 11,15,16, 6,12, 2, 9 //28
+ sm3_round_16 12,16, 0, 7,13, 3,10 //29
+ sm3_round_16 13, 0, 1, 8,14, 4,11 //30
+ sm3_round_16 14, 1, 2, 9,15, 5,12 //31
+ sm3_round_16 15, 2, 3,10,16, 6,13 //32
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //33
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //34
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //35
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //36
+ sm3_round_16 3, 7, 8,15, 4,11, 1 //37
+ sm3_round_16 4, 8, 9,16, 5,12, 2 //38
+ sm3_round_16 5, 9,10, 0, 6,13, 3 //39
+ sm3_round_16 6,10,11, 1, 7,14, 4 //40
+ sm3_round_16 7,11,12, 2, 8,15, 5 //41
+ sm3_round_16 8,12,13, 3, 9,16, 6 //42
+ sm3_round_16 9,13,14, 4,10, 0, 7 //43
+ sm3_round_16 10,14,15, 5,11, 1, 8 //44
+ sm3_round_16 11,15,16, 6,12, 2, 9 //45
+ sm3_round_16 12,16, 0, 7,13, 3,10 //46
+ sm3_round_16 13, 0, 1, 8,14, 4,11 //47
+ sm3_round_16 14, 1, 2, 9,15, 5,12 //48
+ sm3_round_16 15, 2, 3,10,16, 6,13 //49
+ sm3_round_16 16, 3, 4,11, 0, 7,14 //50
+ sm3_round_16 0, 4, 5,12, 1, 8,15 //51
+ sm3_round_16 1, 5, 6,13, 2, 9,16 //52
+ sm3_round_16 2, 6, 7,14, 3,10, 0 //53
+ sm3_round_16 3, 7, 8,15, 4,11, 1 //54
+ sm3_round_16 4, 8, 9,16, 5,12, 2 //55
+ sm3_round_16 5, 9,10, 0, 6,13, 3 //56
+ sm3_round_16 6,10,11, 1, 7,14, 4 //57
+ sm3_round_16 7,11,12, 2, 8,15, 5 //58
+ sm3_round_16 8,12,13, 3, 9,16, 6 //59
+ sm3_round_16 9,13,14, 4,10, 0, 7 //60
+ sm3_round_16 10,14,15, 5,11, 1, 8 //61
+ sm3_round_16 11,15,16, 6,12, 2, 9 //62
+ sm3_round_63 12,16, 0, 7,13, 3,10 //63
+
+ subs len,len,1
+ bne .start_loop
+
+	//store digests back in big-endian byte order
+ rev32 vdig_A.16b,vdig_A.16b
+ rev32 vdig_B.16b,vdig_B.16b
+ rev32 vdig_C.16b,vdig_C.16b
+ rev32 vdig_D.16b,vdig_D.16b
+ st4 {vdig_A.s-vdig_D.s}[0],[job0_digest],16
+ rev32 vdig_E.16b,vdig_E.16b
+ rev32 vdig_F.16b,vdig_F.16b
+ st4 {vdig_A.s-vdig_D.s}[1],[job1_digest],16
+ rev32 vdig_G.16b,vdig_G.16b
+ rev32 vdig_H.16b,vdig_H.16b
+ st4 {vdig_A.s-vdig_D.s}[2],[job2_digest],16
+ st4 {vdig_A.s-vdig_D.s}[3],[job3_digest],16
+ st4 {vdig_E.s-vdig_H.s}[0],[job0_digest]
+ st4 {vdig_E.s-vdig_H.s}[1],[job1_digest]
+ st4 {vdig_E.s-vdig_H.s}[2],[job2_digest]
+ st4 {vdig_E.s-vdig_H.s}[3],[job3_digest]
+
+.exit_func:
+ ldp d8, d9, [sp,16]
+ ldp d10,d11,[sp,32]
+ ldp d12,d13,[sp,48]
+ ldp d14,d15,[sp,64]
+ ldp x29, x30, [sp], STACK_SIZE
+ ret
+.consts:
+ .word 0x79cc4519
+ .word 0x79cc4519
+ .word 0x79cc4519
+ .word 0x79cc4519
+ .word 0x9d8a7a87
+ .word 0x9d8a7a87
+ .word 0x9d8a7a87
+ .word 0x9d8a7a87
+ .size sm3_mb_asimd_x4, .-sm3_mb_asimd_x4
+
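The x4 kernel above evaluates four SM3 message streams in parallel, one per vector lane, keeping SS1/SS2/TT1/TT2 in vector temporaries and spelling out the P0 permutation as ushr/sli pairs. As a cross-check for a single lane, a scalar C sketch of one round for j >= 16 (the FF/GG variants matched by the orr/and/bsl sequence in sm3_round_16) could look as follows; the function and parameter names are illustrative only, and Tj stands for ROTL32(T_j, j), which the assembly keeps pre-rotated in vTj.

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }
    static inline uint32_t P0(uint32_t x) { return x ^ rotl32(x, 9) ^ rotl32(x, 17); }

    /* One SM3 round for j >= 16 over the state A..H of a single lane.  Wj/Wj4
     * are the schedule words W[j] and W[j+4]; Tj is ROTL32(T_j, j). */
    static void sm3_round_ref(uint32_t s[8], uint32_t Wj, uint32_t Wj4, uint32_t Tj)
    {
        uint32_t A = s[0], B = s[1], C = s[2], D = s[3];
        uint32_t E = s[4], F = s[5], G = s[6], H = s[7];
        uint32_t SS1 = rotl32(rotl32(A, 12) + E + Tj, 7);
        uint32_t SS2 = SS1 ^ rotl32(A, 12);
        uint32_t TT1 = ((A & B) | (A & C) | (B & C)) + D + SS2 + (Wj ^ Wj4);
        uint32_t TT2 = ((E & F) | (~E & G)) + H + SS1 + Wj;

        s[3] = C;             /* D = C            */
        s[2] = rotl32(B, 9);  /* C = ROTL32(B, 9) */
        s[1] = A;             /* B = A            */
        s[0] = TT1;           /* A = TT1          */
        s[7] = G;             /* H = G            */
        s[6] = rotl32(F, 19); /* G = ROTL32(F,19) */
        s[5] = E;             /* F = E            */
        s[4] = P0(TT2);       /* E = P0(TT2)      */
    }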
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c
new file mode 100644
index 000000000..6e1dff45e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_asimd_aarch64.c
@@ -0,0 +1,246 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+#define SM3_LOG2_BLOCK_SIZE 6
+void sm3_mb_mgr_init_asimd(SM3_MB_JOB_MGR * state);
+SM3_JOB *sm3_mb_mgr_submit_asimd(SM3_MB_JOB_MGR * state, SM3_JOB * job);
+SM3_JOB *sm3_mb_mgr_flush_asimd(SM3_MB_JOB_MGR * state);
+static inline void hash_init_digest(SM3_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len);
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx);
+
+void sm3_ctx_mgr_init_asimd(SM3_HASH_CTX_MGR * mgr)
+{
+ sm3_mb_mgr_init_asimd(&mgr->mgr);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit_asimd(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sm3_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush_asimd(SM3_HASH_CTX_MGR * mgr)
+{
+ SM3_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_asimd(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sm3_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SM3_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SM3_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_asimd(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_asimd(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define cpu_to_be32(v) (((v&0xff000000)>>24) | ((v&0xff0000)>>8) | ((v&0xff00)<<8) | ((v&0xff)<<24))
+#else
+#define cpu_to_be32(v) (v)
+#endif
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+ static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+ { cpu_to_be32(0x7380166f), cpu_to_be32(0x4914b2b9),
+ cpu_to_be32(0x172442d7), cpu_to_be32(0xda8a0600),
+ cpu_to_be32(0xa96f30bc), cpu_to_be32(0x163138aa),
+ cpu_to_be32(0xe38dee4d), cpu_to_be32(0xb0fb0e4e)
+ };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SM3_PADLENGTHFIELD_SIZE;
+
+#if SM3_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
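The context manager above layers partial-block buffering and padding on top of the ASIMD job manager. A minimal caller sketch, assuming the usual isa-l_crypto multi-buffer conventions (the hash_ctx_init() helper from multi_buffer.h and HASH_ENTIRE for buffers that are complete messages); the buffer sizes and names below are illustrative, not part of the patch.

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdint.h>
    #include "sm3_mb.h"

    int main(void)
    {
        SM3_HASH_CTX_MGR *mgr = NULL;
        SM3_HASH_CTX ctxpool[3], *ctx;
        static uint8_t bufs[3][1024];   /* illustrative inputs, all zero here */
        int i;

        if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
            return 1;
        sm3_ctx_mgr_init_asimd(mgr);

        for (i = 0; i < 3; i++) {
            hash_ctx_init(&ctxpool[i]);   /* helper assumed from multi_buffer.h */
            /* Each buffer is a whole message, so FIRST and LAST are combined. */
            sm3_ctx_mgr_submit_asimd(mgr, &ctxpool[i], bufs[i],
                                     sizeof(bufs[i]), HASH_ENTIRE);
        }
        /* With fewer than four jobs the submits return NULL; flush drains the lanes. */
        while ((ctx = sm3_ctx_mgr_flush_asimd(mgr)) != NULL)
            printf("job %d done, digest[0] = 0x%08x\n",
                   (int)(ctx - ctxpool), ctx->job.result_digest[0]);

        free(mgr);
        return 0;
    }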
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c
new file mode 100644
index 000000000..5af9ead38
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_ctx_sm_aarch64.c
@@ -0,0 +1,241 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+#define SM3_LOG2_BLOCK_SIZE 6
+void sm3_mb_mgr_init_sm(SM3_MB_JOB_MGR * state);
+SM3_JOB *sm3_mb_mgr_submit_sm(SM3_MB_JOB_MGR * state, SM3_JOB * job);
+SM3_JOB *sm3_mb_mgr_flush_sm(SM3_MB_JOB_MGR * state);
+static inline void hash_init_digest(SM3_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len);
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx);
+
+void sm3_ctx_mgr_init_sm(SM3_HASH_CTX_MGR * mgr)
+{
+ sm3_mb_mgr_init_sm(&mgr->mgr);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit_sm(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_sm(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sm3_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush_sm(SM3_HASH_CTX_MGR * mgr)
+{
+ SM3_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_sm(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sm3_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SM3_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SM3_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_sm(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_sm(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+ static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+ { to_be32(0x7380166f), to_be32(0x4914b2b9),
+ to_be32(0x172442d7), to_be32(0xda8a0600),
+ to_be32(0xa96f30bc), to_be32(0x163138aa),
+ to_be32(0xe38dee4d), to_be32(0xb0fb0e4e)
+ };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SM3_PADLENGTHFIELD_SIZE;
+
+#if SM3_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
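hash_pad() above places the 0x80 marker, the zero fill and the 64-bit bit-length after the buffered tail and reports how many trailing blocks that produces. A small standalone check of the index arithmetic, assuming SM3_PADLENGTHFIELD_SIZE is 8 (SM3 uses a 64-bit length field); the helper name is illustrative.

    /* Padding needs one 0x80 byte plus an 8-byte bit length, so a message whose
     * last block already holds more than 64 - 9 = 55 bytes spills into a second
     * trailing block.  The expression mirrors the one in hash_pad(). */
    #include <assert.h>
    #include <stdint.h>

    static uint32_t extra_blocks(uint64_t total_len)
    {
        uint32_t i = (uint32_t)(total_len & 63);           /* bytes buffered in the last block */
        i += ((63 & (0 - (total_len + 8 + 1))) + 1 + 8);   /* jump to the end of the length field */
        return i >> 6;
    }

    int main(void)
    {
        assert(extra_blocks(100) == 1);  /* 36 buffered: 0x80 + 19 zeros + length fits  */
        assert(extra_blocks(120) == 2);  /* 56 buffered: 56 + 1 + 8 > 64, second block  */
        assert(extra_blocks(64)  == 1);  /* block-aligned input still pads a full block */
        return 0;
    }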
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c
new file mode 100644
index 000000000..48a0d4d0e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_asimd_aarch64.c
@@ -0,0 +1,188 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sm3_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SM3_MB_CE_MAX_LANES 4
+void sm3_mb_asimd_x4(SM3_JOB *, SM3_JOB *, SM3_JOB *, SM3_JOB *, int);
+void sm3_mb_asimd_x1(SM3_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+void sm3_mb_mgr_init_asimd(SM3_MB_JOB_MGR * state)
+{
+ unsigned int i;
+
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= SM3_MB_CE_MAX_LANES - 1 - i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	//lanes beyond SM3_MB_CE_MAX_LANES are marked invalid
+ for (; i < SM3_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sm3_mb_mgr_do_jobs(SM3_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i;
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+ if (state->num_lanes_inuse == 4) {
+ len = min(min(state->lens[0], state->lens[1]),
+ min(state->lens[2], state->lens[3]));
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+ sm3_mb_asimd_x4(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane,
+ state->ldata[3].job_in_lane, len >> 4);
+ //only return the min length job
+ for (i = 0; i < SM3_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+
+ return lane_idx;
+ } else {
+ for (i = 0; i < SM3_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ len = state->lens[i] & (~0xf);
+ sm3_mb_asimd_x1(state->ldata[i].job_in_lane, len >> 4);
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ return i;
+ }
+ }
+ }
+ return -1;
+
+}
+
+static SM3_JOB *sm3_mb_mgr_free_lane(SM3_MB_JOB_MGR * state)
+{
+ int i;
+ SM3_JOB *ret = NULL;
+
+ for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sm3_mb_mgr_insert_job(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+ int lane_idx;
+ //add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+ //fatal error
+ assert(lane_idx < SM3_MB_CE_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SM3_JOB *sm3_mb_mgr_submit_asimd(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SM3_JOB *ret;
+
+ //add job into lanes
+ sm3_mb_mgr_insert_job(state, job);
+
+ ret = sm3_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	//submit defers processing until every lane has a job
+ if (state->num_lanes_inuse < SM3_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sm3_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sm3_mb_mgr_do_jobs(state);
+#endif
+
+ //~ i = lane_idx;
+ ret = sm3_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SM3_JOB *sm3_mb_mgr_flush_asimd(SM3_MB_JOB_MGR * state)
+{
+ SM3_JOB *ret;
+ ret = sm3_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sm3_mb_mgr_do_jobs(state);
+ return sm3_mb_mgr_free_lane(state);
+
+}
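The job manager above packs each lane's remaining block count into the upper bits of lens[i] and keeps the lane index in the low 4 bits, so the minimum over the active lens values yields both the shortest job and its lane in one comparison chain. A small sketch of that encoding, with illustrative macro names:

    #include <assert.h>
    #include <stdint.h>

    #define LANE_OF(v)   ((v) & 0xf)   /* low nibble: lane index               */
    #define BLOCKS_OF(v) ((v) >> 4)    /* upper bits: remaining 64-byte blocks */
    #define PACK(blocks, lane) (((blocks) << 4) | (lane))

    int main(void)
    {
        uint32_t lens[4] = { PACK(7, 0), PACK(3, 1), PACK(9, 2), PACK(3, 3) };
        uint32_t m = lens[0];
        for (int i = 1; i < 4; i++)
            if (lens[i] < m)
                m = lens[i];
        /* Lane 1 wins the tie with lane 3 because its low nibble is smaller. */
        assert(BLOCKS_OF(m) == 3 && LANE_OF(m) == 1);
        /* All lanes are then run for BLOCKS_OF(m) blocks and the winning lane's
         * job is the one handed back, as in sm3_mb_mgr_do_jobs(). */
        return 0;
    }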
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c
new file mode 100644
index 000000000..a7178e0be
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_mgr_sm_aarch64.c
@@ -0,0 +1,250 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stddef.h>
+#include <sm3_mb.h>
+#include <assert.h>
+
+#ifndef max
+#define max(a,b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define SM3_MB_CE_MAX_LANES 4
+#if SM3_MB_CE_MAX_LANES >=4
+void sm3_mb_sm_x4(SM3_JOB *, SM3_JOB *, SM3_JOB *, SM3_JOB *, int);
+#endif
+#if SM3_MB_CE_MAX_LANES >=3
+void sm3_mb_sm_x3(SM3_JOB *, SM3_JOB *, SM3_JOB *, int);
+#endif
+#if SM3_MB_CE_MAX_LANES >=2
+void sm3_mb_sm_x2(SM3_JOB *, SM3_JOB *, int);
+#endif
+void sm3_mb_sm_x1(SM3_JOB *, int);
+
+#define LANE_IS_NOT_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FINISHED(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane!=NULL)
+#define LANE_IS_FREE(state,i) \
+ (((state->lens[i]&(~0xf))==0) && state->ldata[i].job_in_lane==NULL)
+#define LANE_IS_INVALID(state,i) \
+ (((state->lens[i]&(~0xf))!=0) && state->ldata[i].job_in_lane==NULL)
+void sm3_mb_mgr_init_sm(SM3_MB_JOB_MGR * state)
+{
+ unsigned int i;
+
+ state->unused_lanes = 0xf;
+ state->num_lanes_inuse = 0;
+ for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= SM3_MB_CE_MAX_LANES - 1 - i;
+ state->lens[i] = i;
+ state->ldata[i].job_in_lane = 0;
+ }
+
+	//lanes beyond SM3_MB_CE_MAX_LANES are marked invalid
+ for (; i < SM3_MAX_LANES; i++) {
+ state->lens[i] = 0xf;
+ state->ldata[i].job_in_lane = 0;
+ }
+}
+
+static int sm3_mb_mgr_do_jobs(SM3_MB_JOB_MGR * state)
+{
+ int lane_idx, len, i, lanes;
+
+ int lane_idx_array[SM3_MAX_LANES];
+
+ if (state->num_lanes_inuse == 0) {
+ return -1;
+ }
+#if SM3_MB_CE_MAX_LANES == 4
+ if (state->num_lanes_inuse == 4) {
+ len = min(min(state->lens[0], state->lens[1]),
+ min(state->lens[2], state->lens[3]));
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sm3_mb_sm_x4(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane,
+ state->ldata[3].job_in_lane, len >> 4);
+
+ } else
+#elif SM3_MB_CE_MAX_LANES == 3
+ if (state->num_lanes_inuse == 3) {
+ len = min(min(state->lens[0], state->lens[1]), state->lens[2]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+
+ sm3_mb_sm_x3(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane,
+ state->ldata[2].job_in_lane, len >> 4);
+
+ } else
+#elif SM3_MB_CE_MAX_LANES == 2
+ if (state->num_lanes_inuse == 2) {
+ len = min(state->lens[0], state->lens[1]);
+ lane_idx = len & 0xf;
+ len &= ~0xf;
+ sm3_mb_sm_x2(state->ldata[0].job_in_lane,
+ state->ldata[1].job_in_lane, len >> 4);
+
+ } else
+#endif
+ {
+ lanes = 0, len = 0;
+ for (i = 0; i < SM3_MAX_LANES && lanes < state->num_lanes_inuse; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ if (lanes)
+ len = min(len, state->lens[i]);
+ else
+ len = state->lens[i];
+ lane_idx_array[lanes] = i;
+ lanes++;
+ }
+ }
+ if (lanes == 0)
+ return -1;
+ lane_idx = len & 0xf;
+ len = len & (~0xf);
+#if SM3_MB_CE_MAX_LANES >=4
+ if (lanes == 4) {
+ sm3_mb_sm_x4(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane,
+ state->ldata[lane_idx_array[3]].job_in_lane, len >> 4);
+ } else
+#endif
+#if SM3_MB_CE_MAX_LANES >=3
+ if (lanes == 3) {
+ sm3_mb_sm_x3(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane,
+ state->ldata[lane_idx_array[2]].job_in_lane, len >> 4);
+ } else
+#endif
+#if SM3_MB_CE_MAX_LANES >=2
+ if (lanes == 2) {
+ sm3_mb_sm_x2(state->ldata[lane_idx_array[0]].job_in_lane,
+ state->ldata[lane_idx_array[1]].job_in_lane, len >> 4);
+ } else
+#endif
+ {
+ sm3_mb_sm_x1(state->ldata[lane_idx_array[0]].job_in_lane, len >> 4);
+ }
+ }
+ //only return the min length job
+ for (i = 0; i < SM3_MAX_LANES; i++) {
+ if (LANE_IS_NOT_FINISHED(state, i)) {
+ state->lens[i] -= len;
+ state->ldata[i].job_in_lane->len -= len;
+ state->ldata[i].job_in_lane->buffer += len << 2;
+ }
+ }
+
+ return lane_idx;
+
+}
+
+static SM3_JOB *sm3_mb_mgr_free_lane(SM3_MB_JOB_MGR * state)
+{
+ int i;
+ SM3_JOB *ret = NULL;
+
+ for (i = 0; i < SM3_MB_CE_MAX_LANES; i++) {
+ if (LANE_IS_FINISHED(state, i)) {
+
+ state->unused_lanes <<= 4;
+ state->unused_lanes |= i;
+ state->num_lanes_inuse--;
+ ret = state->ldata[i].job_in_lane;
+ ret->status = STS_COMPLETED;
+ state->ldata[i].job_in_lane = NULL;
+ break;
+ }
+ }
+ return ret;
+}
+
+static void sm3_mb_mgr_insert_job(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+ int lane_idx;
+ //add job into lanes
+ lane_idx = state->unused_lanes & 0xf;
+ //fatal error
+ assert(lane_idx < SM3_MB_CE_MAX_LANES);
+ state->lens[lane_idx] = (job->len << 4) | lane_idx;
+ state->ldata[lane_idx].job_in_lane = job;
+ state->unused_lanes >>= 4;
+ state->num_lanes_inuse++;
+}
+
+SM3_JOB *sm3_mb_mgr_submit_sm(SM3_MB_JOB_MGR * state, SM3_JOB * job)
+{
+#ifndef NDEBUG
+ int lane_idx;
+#endif
+ SM3_JOB *ret;
+
+ //add job into lanes
+ sm3_mb_mgr_insert_job(state, job);
+
+ ret = sm3_mb_mgr_free_lane(state);
+ if (ret != NULL) {
+ return ret;
+ }
+	//submit defers processing until every lane has a job
+ if (state->num_lanes_inuse < SM3_MB_CE_MAX_LANES)
+ return NULL;
+#ifndef NDEBUG
+ lane_idx = sm3_mb_mgr_do_jobs(state);
+ assert(lane_idx != -1);
+#else
+ sm3_mb_mgr_do_jobs(state);
+#endif
+
+ ret = sm3_mb_mgr_free_lane(state);
+ return ret;
+}
+
+SM3_JOB *sm3_mb_mgr_flush_sm(SM3_MB_JOB_MGR * state)
+{
+ SM3_JOB *ret;
+ ret = sm3_mb_mgr_free_lane(state);
+ if (ret) {
+ return ret;
+ }
+
+ sm3_mb_mgr_do_jobs(state);
+ return sm3_mb_mgr_free_lane(state);
+
+}
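sm3_mb_mgr_init_sm() above seeds unused_lanes as a stack of free-lane nibbles terminated by the 0xf sentinel; sm3_mb_mgr_insert_job() pops the low nibble to pick a lane and sm3_mb_mgr_free_lane() pushes it back when the job completes. The same bookkeeping in isolation, as a sketch:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t unused_lanes = 0xf;
        /* Init: push lanes 3,2,1,0 so lane 0 ends up in the low nibble. */
        for (int i = 0; i < 4; i++) {
            unused_lanes <<= 4;
            unused_lanes |= 4 - 1 - i;
        }
        assert(unused_lanes == 0xf3210);

        /* Submit pops the low nibble as the lane to fill... */
        int lane = unused_lanes & 0xf;
        unused_lanes >>= 4;
        assert(lane == 0 && unused_lanes == 0xf321);

        /* ...and a finished job pushes its lane back onto the stack. */
        unused_lanes <<= 4;
        unused_lanes |= lane;
        assert(unused_lanes == 0xf3210);
        return 0;
    }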
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S
new file mode 100644
index 000000000..836bd9ccc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_multibinary_aarch64.S
@@ -0,0 +1,36 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#include "aarch64_multibinary.h"
+
+
+mbin_interface sm3_ctx_mgr_submit
+mbin_interface sm3_ctx_mgr_init
+mbin_interface sm3_ctx_mgr_flush
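The mbin_interface entries above bind sm3_ctx_mgr_init/submit/flush to a concrete implementation at run time. The actual selection logic lives in aarch64_multibinary.h and is not shown in this patch; conceptually it prefers the SM3-extension kernels when the CPU advertises them and falls back to ASIMD, roughly like this hypothetical Linux-only resolver (the function name and fallback HWCAP value are assumptions for illustration):

    #include <sys/auxv.h>
    #include "sm3_mb.h"

    #ifndef HWCAP_SM3
    #define HWCAP_SM3 (1 << 18)   /* aarch64 hwcap bit for the SM3 instructions */
    #endif

    void sm3_ctx_mgr_init_sm(SM3_HASH_CTX_MGR *mgr);
    void sm3_ctx_mgr_init_asimd(SM3_HASH_CTX_MGR *mgr);

    /* Illustrative dispatcher: prefer the sm3-extension path, else ASIMD. */
    void sm3_ctx_mgr_init_dispatch(SM3_HASH_CTX_MGR *mgr)
    {
        if (getauxval(AT_HWCAP) & HWCAP_SM3)
            sm3_ctx_mgr_init_sm(mgr);
        else
            sm3_ctx_mgr_init_asimd(mgr);
    }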
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S
new file mode 100644
index 000000000..f92ac5e9f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x1.S
@@ -0,0 +1,237 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a+sm4
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ ext v\msg4\().16b, v\msg1\().16b, v\msg2\().16b, #12
+ ext v\tmp0\().16b, v\msg0\().16b, v\msg1\().16b, #12
+ ext v\tmp1\().16b, v\msg2\().16b, v\msg3\().16b, #8
+ sm3partw1 v\msg4\().4s, v\msg0\().4s, v\msg3\().4s
+ sm3partw2 v\msg4\().4s, v\tmp1\().4s, v\tmp0\().4s
+
+.endm
+
+.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req
+ eor v\tmp0\().16b, v\msg0\().16b, v\msg1\().16b
+
+
+ sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[0]
+ sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[0]
+
+ sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[1]
+ sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[1]
+
+ sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[2]
+ sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[2]
+
+ sm3ss1 v\tmp1\().4s, v\dig0\().4s, v\dig1\().4s, v\const\().4s
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ sm3tt1\ab v\dig0\().4s, v\tmp1\().4s, v\tmp0\().4s[3]
+ sm3tt2\ab v\dig1\().4s, v\tmp1\().4s, v\msg0\().4s[3]
+
+.endm
+
+.macro quad_round_expand ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ message_expand \msg0,\msg1,\msg2,\msg3,\msg4,\tmp0,\tmp1
+ quad_round \ab,\const,\dig0,\dig1,\msg0,\msg1,\tmp0,\tmp1
+.endm
+ job .req x0
+ len .req x1
+ data .req x2
+ digest .req x0
+ end_ptr .req x1
+
+
+ declare_var_vector_reg msg0,0
+ declare_var_vector_reg msg1,1
+ declare_var_vector_reg msg2,2
+ declare_var_vector_reg msg3,3
+ declare_var_vector_reg msg4,4
+ declare_var_vector_reg dig0,5
+ declare_var_vector_reg dig1,6
+ declare_var_vector_reg backup_dig0, 7
+
+ declare_var_vector_reg tmp0,16
+ declare_var_vector_reg tmp1,17
+ declare_var_vector_reg backup_dig1, 18
+
+ declare_var_vector_reg const0,19
+ declare_var_vector_reg const1,20
+ declare_var_vector_reg const2,21
+ declare_var_vector_reg const3,22
+ declare_var_vector_reg const4,23
+ declare_var_vector_reg const5,24
+ declare_var_vector_reg const6,25
+ declare_var_vector_reg const7,26
+ declare_var_vector_reg const8,27
+ declare_var_vector_reg const9,28
+ declare_var_vector_reg const10,29
+ declare_var_vector_reg const11,30
+
+
+
+
+ .global sm3_mb_sm_x1
+ .type sm3_mb_sm_x1, %function
+sm3_mb_sm_x1:
+ adrp x3,.consts
+ ldr data, [job],64
+ add x3,x3,:lo12:.consts
+ ldp qdig0,qdig1,[digest]
+ ld1 {vconst0.16b-vconst3.16b},[x3],64
+ add end_ptr,data,len,lsl 6
+ ld1 {vconst4.16b-vconst7.16b},[x3],64
+ //rev128
+ ext vdig0.16b,vdig0.16b,vdig0.16b,#8
+ ext vdig1.16b,vdig1.16b,vdig1.16b,#8
+ ld1 {vconst8.16b-vconst11.16b},[x3],64
+ rev64 vdig0.16b,vdig0.16b
+ rev64 vdig1.16b,vdig1.16b
+
+
+start_loop:
+ mov vbackup_dig0.16b,vdig0.16b
+ mov vbackup_dig1.16b,vdig1.16b
+ ldp qmsg0,qmsg1,[data],32
+ ldp qmsg2,qmsg3,[data],32
+
+ // big-endian to little-endian
+ rev32 vmsg0.16b,vmsg0.16b
+ rev32 vmsg1.16b,vmsg1.16b
+ rev32 vmsg2.16b,vmsg2.16b
+ rev32 vmsg3.16b,vmsg3.16b
+
+ quad_round_expand a, const0, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand a, const1, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand a, const2, dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ quad_round_expand a, const3, dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ quad_round_expand b, const4, dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ quad_round_expand b, const5, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand b, const6, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand b, const7, dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ quad_round_expand b, const8, dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ quad_round_expand b, const9, dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ quad_round_expand b, const10, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand b, const11, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand b, const4, dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+
+
+ quad_round b, const5, dig0, dig1, msg3, msg4, tmp0, tmp1
+ cmp data,end_ptr
+ quad_round b, const6, dig0, dig1, msg4, msg0, tmp0, tmp1
+ quad_round b, const7, dig0, dig1, msg0, msg1, tmp0, tmp1
+
+ eor vdig0.16b,vdig0.16b,vbackup_dig0.16b
+ eor vdig1.16b,vdig1.16b,vbackup_dig1.16b
+
+
+ bcc start_loop
+
+ //rev128
+ ext vdig0.16b,vdig0.16b,vdig0.16b,#8
+ ext vdig1.16b,vdig1.16b,vdig1.16b,#8
+ rev64 vdig0.16b,vdig0.16b
+ rev64 vdig1.16b,vdig1.16b
+ str qdig0,[digest]
+ str qdig1,[digest,16]
+ ret
+ dsb ish
+ isb
+ .align 2
+.consts:
+ .word 0xce6228cb // 3
+ .word 0xe7311465 // 2
+ .word 0xf3988a32 // 1
+ .word 0x79cc4519 // 0
+ .word 0xe6228cbc // 7
+ .word 0x7311465e // 6
+ .word 0x3988a32f // 5
+ .word 0x9cc45197 // 4
+ .word 0x6228cbce //11
+ .word 0x311465e7 //10
+ .word 0x988a32f3 // 9
+ .word 0xcc451979 // 8
+ .word 0x228cbce6 //15
+ .word 0x11465e73 //14
+ .word 0x88a32f39 //13
+ .word 0xc451979c //12
+ .word 0xec53d43c //19
+ .word 0x7629ea1e //18
+ .word 0x3b14f50f //17
+ .word 0x9d8a7a87 //16
+ .word 0xc53d43ce //23
+ .word 0x629ea1e7 //22
+ .word 0xb14f50f3 //21
+ .word 0xd8a7a879 //20
+ .word 0x53d43cec //27
+ .word 0x29ea1e76 //26
+ .word 0x14f50f3b //25
+ .word 0x8a7a879d //24
+ .word 0x3d43cec5 //31
+ .word 0x9ea1e762 //30
+ .word 0x4f50f3b1 //29
+ .word 0xa7a879d8 //28
+ .word 0xd43cec53 //35
+ .word 0xea1e7629 //34
+ .word 0xf50f3b14 //33
+ .word 0x7a879d8a //32
+ .word 0x43cec53d //39
+ .word 0xa1e7629e //38
+ .word 0x50f3b14f //37
+ .word 0xa879d8a7 //36
+ .word 0x3cec53d4 //43
+ .word 0x1e7629ea //42
+ .word 0x0f3b14f5 //41
+ .word 0x879d8a7a //40
+ .word 0xcec53d43 //47
+ .word 0xe7629ea1 //46
+ .word 0xf3b14f50 //45
+ .word 0x79d8a7a8 //44
+ .word 0xec53d43c //51
+ .word 0x7629ea1e //50
+ .word 0x3b14f50f //49
+
+
+ .size sm3_mb_sm_x1, .-sm3_mb_sm_x1
+
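message_expand in sm3_mb_sm_x1 uses sm3partw1/sm3partw2 to derive four new schedule words per step. Together those instructions implement the SM3 expansion recurrence W[j] = P1(W[j-16] ^ W[j-9] ^ ROTL32(W[j-3],15)) ^ ROTL32(W[j-13],7) ^ W[j-6], where P1(x) = x ^ ROTL32(x,15) ^ ROTL32(x,23). A scalar reference for cross-checking one lane; the names are illustrative only.

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }
    static inline uint32_t P1(uint32_t x) { return x ^ rotl32(x, 15) ^ rotl32(x, 23); }

    /* Expand the 16 message words of one block into the 68-word schedule. */
    static void sm3_expand_ref(const uint32_t w16[16], uint32_t W[68])
    {
        int j;
        for (j = 0; j < 16; j++)
            W[j] = w16[j];
        for (j = 16; j < 68; j++)
            W[j] = P1(W[j - 16] ^ W[j - 9] ^ rotl32(W[j - 3], 15))
                   ^ rotl32(W[j - 13], 7) ^ W[j - 6];
    }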
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S
new file mode 100644
index 000000000..4e4a6e738
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x2.S
@@ -0,0 +1,344 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a+sm4
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+.macro do_ext job,arg0,arg1,arg2,arg3
+ ext v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b,\arg3
+.endm
+.macro do_sm3partw1 job,msg4,msg0,msg3
+ sm3partw1 v\job\()_\msg4\().4s, v\job\()_\msg0\().4s, v\job\()_\msg3\().4s
+.endm
+.macro do_sm3partw2 job,msg4,tmp1,tmp0
+ sm3partw2 v\job\()_\msg4\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s
+.endm
+
+.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ .irp j,0,1
+ do_ext job\j,\msg4,\msg1,\msg2,#12
+ .endr
+ .irp j,0,1
+ do_ext job\j,\tmp0,\msg0,\msg1,#12
+ .endr
+ .irp j,0,1
+ do_ext job\j,\tmp1,\msg2,\msg3,#8
+ .endr
+
+ .irp j,0,1
+ do_sm3partw1 job\j,\msg4, \msg0, \msg3
+ .endr
+ .irp j,0,1
+ do_sm3partw2 job\j,\msg4, \tmp1, \tmp0
+ .endr
+
+.endm
+
+.macro do_eor job,arg0,arg1,arg2
+ eor v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b
+.endm
+.macro do_sm3ss1 job,tmp1,dig0,dig1,const
+ sm3ss1 v\job\()_\tmp1\().4s, v\job\()_\dig0\().4s, v\job\()_\dig1\().4s, v\const\().4s
+.endm
+
+.macro do_sm3tt1 job,ab,dig0,tmp1,tmp0,lane
+ sm3tt1\ab v\job\()_\dig0\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s[\lane]
+
+.endm
+.macro do_sm3tt2 job,ab,dig1,tmp1,msg0,lane
+ sm3tt2\ab v\job\()_\dig1\().4s, v\job\()_\tmp1\().4s, v\job\()_\msg0\().4s[\lane]
+.endm
+
+.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req
+ .irp j,0,1
+ do_eor job\j,\tmp0,\msg0,\msg1
+ .endr
+ .irp lane,0,1,2,3
+ .irp j,0,1
+ do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const
+ .endr
+
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ .irp j,0,1
+ do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,\lane
+ .endr
+ .irp j,0,1
+ do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,\lane
+ .endr
+ .endr
+.endm
+
+.macro quad_round_expand ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ message_expand \msg0,\msg1,\msg2,\msg3,\msg4,\tmp0,\tmp1
+ quad_round \ab,\const,\dig0,\dig1,\msg0,\msg1,\tmp0,\tmp1
+.endm
+
+/*
+ Variables
+*/
+ job0 .req x0
+ job1 .req x1
+ len .req x2
+
+ job0_data .req x3
+ job1_data .req x4
+ job0_digest .req x0
+ job1_digest .req x1
+
+ const_adr .req x5
+ end_ptr .req x2
+
+ declare_var_vector_reg job0_msg0, 0
+ declare_var_vector_reg job0_msg1, 1
+ declare_var_vector_reg job0_msg2, 2
+ declare_var_vector_reg job0_msg3, 3
+ declare_var_vector_reg job0_msg4, 4
+ declare_var_vector_reg job0_dig0, 5
+ declare_var_vector_reg job0_dig1, 6
+ declare_var_vector_reg job0_tmp0, 7
+ declare_var_vector_reg job0_tmp1, 8
+ declare_var_vector_reg job0_backup_dig0, 9
+ declare_var_vector_reg job0_backup_dig1, 10
+
+ declare_var_vector_reg job1_msg0, 11
+ declare_var_vector_reg job1_msg1, 12
+ declare_var_vector_reg job1_msg2, 13
+ declare_var_vector_reg job1_msg3, 14
+ declare_var_vector_reg job1_msg4, 15
+ declare_var_vector_reg job1_dig0, 16
+ declare_var_vector_reg job1_dig1, 17
+ declare_var_vector_reg job1_tmp0, 18
+ declare_var_vector_reg job1_tmp1, 19
+ declare_var_vector_reg job1_backup_dig0, 20
+ declare_var_vector_reg job1_backup_dig1, 21
+
+ declare_var_vector_reg const0, 22
+ declare_var_vector_reg const1, 23
+ declare_var_vector_reg const2, 24
+ declare_var_vector_reg const3, 25
+ declare_var_vector_reg const4, 26
+ declare_var_vector_reg const5, 27
+ declare_var_vector_reg const6, 28
+ declare_var_vector_reg const7, 29
+ declare_var_vector_reg const8, 30
+ declare_var_vector_reg const9, 31
+ declare_var_vector_reg const10, 22
+ declare_var_vector_reg const11, 23
+
+.macro do_rev32_msg job:req,msg:req
+ rev32 v\job\()_\msg\().16b,v\job\()_\msg\().16b
+.endm
+.macro do_rev32_job job:req
+ .irp m,0,1,2,3
+ do_rev32_msg \job,msg\m
+ .endr
+.endm
+.macro rev32_msgs
+ .irp j,0,1
+ do_rev32_job job\j
+ .endr
+.endm
+
+
+ .global sm3_mb_sm_x2
+ .type sm3_mb_sm_x2, %function
+sm3_mb_sm_x2:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+
+
+ adrp const_adr,.consts
+ ldr job0_data, [job0],64
+ add const_adr,const_adr,:lo12:.consts
+ ldr job1_data, [job1],64
+ ldp qjob0_dig0,qjob0_dig1,[job0_digest]
+ ldp qjob1_dig0,qjob1_dig1,[job1_digest]
+
+ ldp qconst2,qconst3,[const_adr,32]
+ ldp qconst4,qconst5,[const_adr,64]
+ ldp qconst6,qconst7,[const_adr,96]
+ ldp qconst8,qconst9,[const_adr,128]
+ add end_ptr,job0_data,len,lsl 6
+
+ //rev128
+ ext vjob0_dig0.16b,vjob0_dig0.16b,vjob0_dig0.16b,#8
+ ext vjob0_dig1.16b,vjob0_dig1.16b,vjob0_dig1.16b,#8
+ rev64 vjob0_dig0.16b,vjob0_dig0.16b
+ rev64 vjob0_dig1.16b,vjob0_dig1.16b
+ ext vjob1_dig0.16b,vjob1_dig0.16b,vjob1_dig0.16b,#8
+ ext vjob1_dig1.16b,vjob1_dig1.16b,vjob1_dig1.16b,#8
+ rev64 vjob1_dig0.16b,vjob1_dig0.16b
+ rev64 vjob1_dig1.16b,vjob1_dig1.16b
+
+
+
+
+
+start_loop:
+
+ ld1 {vjob0_msg0.16b-vjob0_msg3.16b},[job0_data],64
+ ld1 {vjob1_msg0.16b-vjob1_msg3.16b},[job1_data],64
+
+ mov vjob0_backup_dig0.16b,vjob0_dig0.16b
+ mov vjob0_backup_dig1.16b,vjob0_dig1.16b
+ mov vjob1_backup_dig0.16b,vjob1_dig0.16b
+ mov vjob1_backup_dig1.16b,vjob1_dig1.16b
+
+ // const10,const11,const0,const1 share registers
+ ldp qconst0,qconst1,[const_adr]
+
+ // big-endian to little-endian
+ rev32_msgs
+
+ cmp job0_data,end_ptr
+ quad_round_expand a, const0 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+
+
+ quad_round_expand a, const1 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ // const10,const11,const0,const1 share registers
+ ldp qconst10,qconst11,[const_adr,160]
+ quad_round_expand a, const2 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ quad_round_expand a, const3 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ quad_round_expand b, const4 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ quad_round_expand b, const5 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand b, const6 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand b, const7 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ quad_round_expand b, const8 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ quad_round_expand b, const9 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ quad_round_expand b, const10, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand b, const11, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand b, const4 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+
+
+ quad_round b, const5, dig0, dig1, msg3, msg4, tmp0, tmp1
+
+ quad_round b, const6, dig0, dig1, msg4, msg0, tmp0, tmp1
+ quad_round b, const7, dig0, dig1, msg0, msg1, tmp0, tmp1
+
+ eor vjob0_dig0.16b,vjob0_dig0.16b,vjob0_backup_dig0.16b
+ eor vjob0_dig1.16b,vjob0_dig1.16b,vjob0_backup_dig1.16b
+ eor vjob1_dig0.16b,vjob1_dig0.16b,vjob1_backup_dig0.16b
+ eor vjob1_dig1.16b,vjob1_dig1.16b,vjob1_backup_dig1.16b
+
+
+ bcc start_loop
+
+ //rev128
+ ext vjob0_dig0.16b,vjob0_dig0.16b,vjob0_dig0.16b,#8
+ ext vjob0_dig1.16b,vjob0_dig1.16b,vjob0_dig1.16b,#8
+ rev64 vjob0_dig0.16b,vjob0_dig0.16b
+ rev64 vjob0_dig1.16b,vjob0_dig1.16b
+ stp qjob0_dig0,qjob0_dig1,[job0_digest]
+
+ ext vjob1_dig0.16b,vjob1_dig0.16b,vjob1_dig0.16b,#8
+ ext vjob1_dig1.16b,vjob1_dig1.16b,vjob1_dig1.16b,#8
+ rev64 vjob1_dig0.16b,vjob1_dig0.16b
+ rev64 vjob1_dig1.16b,vjob1_dig1.16b
+ stp qjob1_dig0,qjob1_dig1,[job1_digest]
+
+#if 1
+ mov v0.16b,vjob1_dig0.16b
+ mov v1.16b,vjob1_dig1.16b
+ b exit_ret
+#endif
+
+exit_ret:
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+ ret
+
+ .align 2
+.consts:
+ .word 0xce6228cb // 3
+ .word 0xe7311465 // 2
+ .word 0xf3988a32 // 1
+ .word 0x79cc4519 // 0
+ .word 0xe6228cbc // 7
+ .word 0x7311465e // 6
+ .word 0x3988a32f // 5
+ .word 0x9cc45197 // 4
+ .word 0x6228cbce //11
+ .word 0x311465e7 //10
+ .word 0x988a32f3 // 9
+ .word 0xcc451979 // 8
+ .word 0x228cbce6 //15
+ .word 0x11465e73 //14
+ .word 0x88a32f39 //13
+ .word 0xc451979c //12
+ .word 0xec53d43c //19
+ .word 0x7629ea1e //18
+ .word 0x3b14f50f //17
+ .word 0x9d8a7a87 //16
+ .word 0xc53d43ce //23
+ .word 0x629ea1e7 //22
+ .word 0xb14f50f3 //21
+ .word 0xd8a7a879 //20
+ .word 0x53d43cec //27
+ .word 0x29ea1e76 //26
+ .word 0x14f50f3b //25
+ .word 0x8a7a879d //24
+ .word 0x3d43cec5 //31
+ .word 0x9ea1e762 //30
+ .word 0x4f50f3b1 //29
+ .word 0xa7a879d8 //28
+ .word 0xd43cec53 //35
+ .word 0xea1e7629 //34
+ .word 0xf50f3b14 //33
+ .word 0x7a879d8a //32
+ .word 0x43cec53d //39
+ .word 0xa1e7629e //38
+ .word 0x50f3b14f //37
+ .word 0xa879d8a7 //36
+ .word 0x3cec53d4 //43
+ .word 0x1e7629ea //42
+ .word 0x0f3b14f5 //41
+ .word 0x879d8a7a //40
+ .word 0xcec53d43 //47
+ .word 0xe7629ea1 //46
+ .word 0xf3b14f50 //45
+ .word 0x79d8a7a8 //44
+ .word 0xec53d43c //51
+ .word 0x7629ea1e //50
+ .word 0x3b14f50f //49
+
+
+ .size sm3_mb_sm_x2, .-sm3_mb_sm_x2
+
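For orientation: the x2 routine above advances two SM3_JOB descriptors in lockstep. The post-indexed loads suggest each job's buffer pointer sits at offset 0, the running digest 64 bytes in, and that len counts whole 64-byte blocks. A minimal caller sketch in C under those assumptions — the prototype below is inferred from the register usage, not taken from a header in this patch:

    #include "sm3_mb.h"

    /* inferred prototype: both jobs advance by the same number of 64-byte blocks */
    void sm3_mb_sm_x2(SM3_JOB *job0, SM3_JOB *job1, int num_blocks);

    static void hash_two_lanes(SM3_JOB *a, SM3_JOB *b, int num_blocks)
    {
            /* a->buffer and b->buffer must each hold num_blocks * 64 bytes */
            sm3_mb_sm_x2(a, b, num_blocks);
    }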
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S
new file mode 100644
index 000000000..58758f98d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x3.S
@@ -0,0 +1,368 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+ .arch armv8.2-a+sm4
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+.macro do_ext job,arg0,arg1,arg2,arg3
+ ext vjob\job\()_\arg0\().16b,vjob\job\()_\arg1\().16b,vjob\job\()_\arg2\().16b,\arg3
+.endm
+.macro do_sm3partw1 job,msg4,msg0,msg3
+ sm3partw1 vjob\job\()_\msg4\().4s, vjob\job\()_\msg0\().4s, vjob\job\()_\msg3\().4s
+.endm
+.macro do_sm3partw2 job,msg4,tmp1,tmp0
+ sm3partw2 vjob\job\()_\msg4\().4s, vjob\job\()_\tmp1\().4s, vjob\job\()_\tmp0\().4s
+.endm
+
+.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ .irp j,0,1,2
+ do_ext \j,\msg4,\msg1,\msg2,#12
+ .endr
+ .irp j,0,1,2
+ do_ext \j,\tmp0,\msg0,\msg1,#12
+ .endr
+ .irp j,0,1,2
+ do_ext \j,\tmp1,\msg2,\msg3,#8
+ .endr
+
+ .irp j,0,1,2
+ do_sm3partw1 \j,\msg4, \msg0, \msg3
+ .endr
+ .irp j,0,1,2
+ do_sm3partw2 \j,\msg4, \tmp1, \tmp0
+ .endr
+
+.endm
+
+.macro do_eor job,arg0,arg1,arg2
+ eor v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b
+.endm
+.macro do_sm3ss1 job,tmp1,dig0,dig1,const
+ sm3ss1 v\job\()_\tmp1\().4s, v\job\()_\dig0\().4s, v\job\()_\dig1\().4s, v\const\().4s
+.endm
+
+.macro do_sm3tt1 job,ab,dig0,tmp1,tmp0,lane
+ sm3tt1\ab v\job\()_\dig0\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s[\lane]
+
+.endm
+.macro do_sm3tt2 job,ab,dig1,tmp1,msg0,lane
+ sm3tt2\ab v\job\()_\dig1\().4s, v\job\()_\tmp1\().4s, v\job\()_\msg0\().4s[\lane]
+.endm
+.macro do_ld_backup_digest job
+ ldp qjob\job\()_backup_dig0,qjob\job\()_backup_dig1,[sp,job\job\()_dig_off]
+.endm
+
+.macro do_st_digest job
+ stp qjob\job\()_dig0,qjob\job\()_dig1,[job\job\()_digest]
+.endm
+.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req,load_digest
+ .irp j,0,1,2
+ do_eor job\j,\tmp0,\msg0,\msg1
+ .ifnb \load_digest
+ do_ld_backup_digest \j
+ .endif
+ .endr
+ .irp lane,0,1,2,3
+ .irp j,0,1,2
+ do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const
+ .endr
+
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ .irp j,0,1,2
+ do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,\lane
+ .endr
+ .irp j,0,1,2
+ do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,\lane
+ .endr
+
+ .endr
+.endm
+
+.macro quad_round_expand ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ message_expand \msg0,\msg1,\msg2,\msg3,\msg4,\tmp0,\tmp1
+ quad_round \ab,\const,\dig0,\dig1,\msg0,\msg1,\tmp0,\tmp1
+.endm
+
+/*
+ Variables
+*/
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ len .req x3
+
+ job0_data .req x4
+ job1_data .req x5
+ job2_data .req x6
+ job0_digest .req x0
+ job1_digest .req x1
+ job2_digest .req x2
+
+ const_adr .req x7
+ end_ptr .req x3
+
+ declare_var_vector_reg job0_msg0, 0
+ declare_var_vector_reg job0_msg1, 1
+ declare_var_vector_reg job0_msg2, 2
+ declare_var_vector_reg job0_msg3, 3
+ declare_var_vector_reg job0_msg4, 4
+ declare_var_vector_reg job0_dig0, 5
+ declare_var_vector_reg job0_dig1, 6
+ declare_var_vector_reg job0_tmp0, 7
+ declare_var_vector_reg job0_tmp1, 8
+ .set job0_dig_off, 64
+ declare_var_vector_reg job0_backup_dig0, 2
+ declare_var_vector_reg job0_backup_dig1, 3
+
+ declare_var_vector_reg job1_msg0, 9
+ declare_var_vector_reg job1_msg1, 10
+ declare_var_vector_reg job1_msg2, 11
+ declare_var_vector_reg job1_msg3, 12
+ declare_var_vector_reg job1_msg4, 13
+ declare_var_vector_reg job1_dig0, 14
+ declare_var_vector_reg job1_dig1, 15
+ declare_var_vector_reg job1_tmp0, 16
+ declare_var_vector_reg job1_tmp1, 17
+ .set job1_dig_off, 96
+ declare_var_vector_reg job1_backup_dig0, 11
+ declare_var_vector_reg job1_backup_dig1, 12
+
+ declare_var_vector_reg job2_msg0, 18
+ declare_var_vector_reg job2_msg1, 19
+ declare_var_vector_reg job2_msg2, 20
+ declare_var_vector_reg job2_msg3, 21
+ declare_var_vector_reg job2_msg4, 22
+ declare_var_vector_reg job2_dig0, 23
+ declare_var_vector_reg job2_dig1, 24
+ declare_var_vector_reg job2_tmp0, 25
+ declare_var_vector_reg job2_tmp1, 26
+ .set job2_dig_off, 128
+ declare_var_vector_reg job2_backup_dig0, 20
+ declare_var_vector_reg job2_backup_dig1, 21
+
+
+ declare_var_vector_reg const0, 27
+ declare_var_vector_reg const1, 28
+ declare_var_vector_reg const2, 29
+ declare_var_vector_reg const3, 30
+ declare_var_vector_reg const4, 27
+ declare_var_vector_reg const5, 28
+ declare_var_vector_reg const6, 29
+ declare_var_vector_reg const7, 30
+ declare_var_vector_reg const8, 27
+ declare_var_vector_reg const9, 28
+ declare_var_vector_reg const10, 29
+ declare_var_vector_reg const11, 30
+
+.macro do_rev32_msg job:req,msg:req
+ rev32 v\job\()_\msg\().16b,v\job\()_\msg\().16b
+.endm
+.macro do_rev32_job job:req
+ .irp m,0,1,2,3
+ do_rev32_msg \job,msg\m
+ .endr
+.endm
+.macro rev32_msgs
+ .irp j,0,1,2
+ do_rev32_job job\j
+ .endr
+.endm
+
+.macro do_rev64 job,regd,regn
+ rev64 vjob\job\()_\regd\().16b,vjob\job\()_\regd\().16b
+.endm
+
+ .global sm3_mb_sm_x3
+ .type sm3_mb_sm_x3, %function
+sm3_mb_sm_x3:
+ //push d8~d15
+ stp d8,d9,[sp,-192]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+
+
+ adrp const_adr,.consts
+ ldr job0_data, [job0],64
+ add const_adr,const_adr,:lo12:.consts
+ ldr job1_data, [job1],64
+ ldr job2_data, [job2],64
+
+ ldp qjob0_dig0,qjob0_dig1,[job0_digest]
+ ldp qjob1_dig0,qjob1_dig1,[job1_digest]
+ ldp qjob2_dig0,qjob2_dig1,[job2_digest]
+ ld1 {vconst0.16b-vconst3.16b},[const_adr]
+ add end_ptr,job0_data,len,lsl 6
+
+ //rev128
+ .irp j,0,1,2
+ do_ext \j,dig0,dig0,dig0,#8
+ do_ext \j,dig1,dig1,dig1,#8
+ do_rev64 \j,dig0,dig0
+ do_rev64 \j,dig1,dig1
+ .endr
+
+
+
+
+
+start_loop:
+
+ ld1 {vjob0_msg0.16b-vjob0_msg3.16b},[job0_data],64
+ stp qjob0_dig0,qjob0_dig1,[sp,job0_dig_off]
+ ld1 {vjob1_msg0.16b-vjob1_msg3.16b},[job1_data],64
+ stp qjob1_dig0,qjob1_dig1,[sp,job1_dig_off]
+ ld1 {vjob2_msg0.16b-vjob2_msg3.16b},[job2_data],64
+ stp qjob2_dig0,qjob2_dig1,[sp,job2_dig_off]
+
+ cmp job0_data,end_ptr
+
+ // big-endian to little-endian
+ rev32_msgs
+
+ quad_round_expand a, const0 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand a, const1 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+
+ ldp qconst4,qconst5,[const_adr,4*16]
+ quad_round_expand a, const2 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ quad_round_expand a, const3 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+
+ ldp qconst6,qconst7,[const_adr,6*16]
+ quad_round_expand b, const4 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ quad_round_expand b, const5 , dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ ldp qconst8,qconst9,[const_adr,8*16]
+ quad_round_expand b, const6 , dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ quad_round_expand b, const7 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ ldp qconst10,qconst11,[const_adr,10*16]
+ quad_round_expand b, const8 , dig0, dig1, msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ quad_round_expand b, const9 , dig0, dig1, msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+
+ ldp qconst4,qconst5,[const_adr,4*16]
+ quad_round_expand b, const10, dig0, dig1, msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ quad_round_expand b, const11, dig0, dig1, msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ ldp qconst6,qconst7,[const_adr,6*16]
+ quad_round_expand b, const4 , dig0, dig1, msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+
+ quad_round b, const5, dig0, dig1, msg3, msg4, tmp0, tmp1
+ ldp qconst0,qconst1,[const_adr]
+ quad_round b, const6, dig0, dig1, msg4, msg0, tmp0, tmp1
+
+ quad_round b, const7, dig0, dig1, msg0, msg1, tmp0, tmp1,1
+ ldp qconst2,qconst3,[const_adr,2*16]
+
+ .irp j,0,1,2
+ do_eor job\j,dig0,dig0,backup_dig0
+ do_eor job\j,dig1,dig1,backup_dig1
+ .endr
+
+ bcc start_loop
+
+ //rev128
+ .irp j,0,1,2
+ do_ext \j,dig0,dig0,dig0,#8
+ do_ext \j,dig1,dig1,dig1,#8
+ do_rev64 \j,dig0,dig0
+ do_rev64 \j,dig1,dig1
+ do_st_digest \j
+ .endr
+
+
+
+exit_ret:
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 192
+ ret
+
+ .align 2
+.consts:
+ .word 0xce6228cb // 3
+ .word 0xe7311465 // 2
+ .word 0xf3988a32 // 1
+ .word 0x79cc4519 // 0
+ .word 0xe6228cbc // 7
+ .word 0x7311465e // 6
+ .word 0x3988a32f // 5
+ .word 0x9cc45197 // 4
+ .word 0x6228cbce //11
+ .word 0x311465e7 //10
+ .word 0x988a32f3 // 9
+ .word 0xcc451979 // 8
+ .word 0x228cbce6 //15
+ .word 0x11465e73 //14
+ .word 0x88a32f39 //13
+ .word 0xc451979c //12
+ .word 0xec53d43c //19
+ .word 0x7629ea1e //18
+ .word 0x3b14f50f //17
+ .word 0x9d8a7a87 //16
+ .word 0xc53d43ce //23
+ .word 0x629ea1e7 //22
+ .word 0xb14f50f3 //21
+ .word 0xd8a7a879 //20
+ .word 0x53d43cec //27
+ .word 0x29ea1e76 //26
+ .word 0x14f50f3b //25
+ .word 0x8a7a879d //24
+ .word 0x3d43cec5 //31
+ .word 0x9ea1e762 //30
+ .word 0x4f50f3b1 //29
+ .word 0xa7a879d8 //28
+ .word 0xd43cec53 //35
+ .word 0xea1e7629 //34
+ .word 0xf50f3b14 //33
+ .word 0x7a879d8a //32
+ .word 0x43cec53d //39
+ .word 0xa1e7629e //38
+ .word 0x50f3b14f //37
+ .word 0xa879d8a7 //36
+ .word 0x3cec53d4 //43
+ .word 0x1e7629ea //42
+ .word 0x0f3b14f5 //41
+ .word 0x879d8a7a //40
+ .word 0xcec53d43 //47
+ .word 0xe7629ea1 //46
+ .word 0xf3b14f50 //45
+ .word 0x79d8a7a8 //44
+ .word 0xec53d43c //51
+ .word 0x7629ea1e //50
+ .word 0x3b14f50f //49
+
+
+ .size sm3_mb_sm_x3, .-sm3_mb_sm_x3
+
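A note on the //rev128 blocks used at entry and exit of these kernels: "ext v,v,v,#8" swaps the two 64-bit halves of a vector and "rev64 v.16b" byte-reverses each half, so together they byte-reverse the full 16-byte digest vector. A scalar C equivalent, shown only for reference and not part of this patch:

    #include <stdint.h>
    #include <stddef.h>

    /* Same effect as "ext v,v,v,#8" followed by "rev64 v.16b,v.16b": reverse 16 bytes. */
    static void rev128(uint8_t d[16])
    {
            for (size_t i = 0; i < 8; i++) {
                    uint8_t t = d[i];
                    d[i] = d[15 - i];
                    d[15 - i] = t;
            }
    }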
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S
new file mode 100644
index 000000000..7f3f1db66
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/aarch64/sm3_mb_sm_x4.S
@@ -0,0 +1,440 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+ .arch armv8.2-a+sm4
+ .text
+ .align 2
+ .p2align 3,,7
+
+.macro declare_var_vector_reg name:req,reg:req
+ q\name\() .req q\reg
+ v\name\() .req v\reg
+ s\name\() .req s\reg
+.endm
+
+.macro do_ext job,arg0,arg1,arg2,arg3
+ ext vjob\job\()_\arg0\().16b,vjob\job\()_\arg1\().16b,vjob\job\()_\arg2\().16b,\arg3
+.endm
+.macro do_sm3partw1 job,msg4,msg0,msg3
+ sm3partw1 vjob\job\()_\msg4\().4s, vjob\job\()_\msg0\().4s, vjob\job\()_\msg3\().4s
+.endm
+.macro do_sm3partw2 job,msg4,tmp1,tmp0
+ sm3partw2 vjob\job\()_\msg4\().4s, vjob\job\()_\tmp1\().4s, vjob\job\()_\tmp0\().4s
+.endm
+
+.macro message_expand msg0:req,msg1:req,msg2:req,msg3:req,msg4:req,tmp0:req,tmp1:req
+ .irp j,0,1,2,3
+ do_ext \j,\msg4,\msg1,\msg2,#12
+ .endr
+ .irp j,0,1,2,3
+ do_ext \j,\tmp0,\msg0,\msg1,#12
+ .endr
+ .irp j,0,1,2,3
+ do_ext \j,\tmp1,\msg2,\msg3,#8
+ .endr
+
+ .irp j,0,1,2,3
+ do_sm3partw1 \j,\msg4, \msg0, \msg3
+ .endr
+ .irp j,0,1,2,3
+ do_sm3partw2 \j,\msg4, \tmp1, \tmp0
+ .endr
+ st1 {vjob0_\msg4\().16b-vjob3_\msg4\().16b},[data_buf],64
+.endm
+
+.macro do_eor job,arg0,arg1,arg2
+ eor v\job\()_\arg0\().16b,v\job\()_\arg1\().16b,v\job\()_\arg2\().16b
+.endm
+.macro do_sm3ss1 job,tmp1,dig0,dig1,const
+ sm3ss1 v\job\()_\tmp1\().4s, v\job\()_\dig0\().4s, v\job\()_\dig1\().4s, v\const\().4s
+.endm
+
+.macro do_sm3tt1 job,ab,dig0,tmp1,tmp0,lane
+ sm3tt1\ab v\job\()_\dig0\().4s, v\job\()_\tmp1\().4s, v\job\()_\tmp0\().4s[\lane]
+
+.endm
+.macro do_sm3tt2 job,ab,dig1,tmp1,msg0,lane
+ sm3tt2\ab v\job\()_\dig1\().4s, v\job\()_\tmp1\().4s, v\job\()_\msg0\().4s[\lane]
+.endm
+.macro do_ld_backup_digest job
+ ldp qjob\job\()_backup_dig0,qjob\job\()_backup_dig1,[sp,job\job\()_dig_off]
+.endm
+
+.macro do_st_digest job
+ stp qjob\job\()_dig0,qjob\job\()_dig1,[job\job\()_digest]
+.endm
+
+.macro quad_round ab:req,const:req,dig0:req,dig1:req,msg0:req,msg1:req,tmp0:req,tmp1:req,is_last
+ .ifnb \is_last
+ ld1 {vjob0_backup_dig0.16b-vjob3_backup_dig0.16b},[dig_buf],64
+ .endif
+
+ .irp j,0,1,2,3
+ do_eor job\j,\tmp0,\msg0,\msg1
+
+ .endr
+
+ .irp lane,0,1,2
+ .irp j,0,1,2,3
+ do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const
+ .endr
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ .irp j,0,1,2,3
+ do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,\lane
+ .endr
+ .irp j,0,1,2,3
+ do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,\lane
+ .endr
+
+
+ .endr
+ .irp j,0,1,2,3
+ do_sm3ss1 job\j,\tmp1,\dig0,\dig1,\const
+ .endr
+ .ifnb \is_last
+
+ ld1 {vjob0_backup_dig1.16b-vjob3_backup_dig1.16b},[dig_buf]
+ .else
+ ext v\const\().16b,v\const\().16b,v\const\().16b,12
+ .endif
+ .irp j,0,1,2,3
+ do_sm3tt2 job\j,\ab,\dig1,\tmp1,\msg0,3
+ .endr
+
+ .irp j,0,1,2,3
+ do_sm3tt1 job\j,\ab,\dig0,\tmp1,\tmp0,3
+ .ifnb \is_last
+ do_eor job\j,dig1,dig1,backup_dig1
+ do_eor job\j,dig0,dig0,backup_dig0
+ .endif
+ .endr
+
+ .ifb \is_last
+ ld1 {vjob0_\msg0\().16b-vjob3_\msg0\().16b},[data_buf],64
+ .endif
+
+.endm
+
+
+
+/*
+ Variables
+*/
+ .set temp_buf_size,(68*4+32)*4
+ .set dig_buf_off,64
+ .set data_buf_off,64+32*4
+ job0 .req x0
+ job1 .req x1
+ job2 .req x2
+ job3 .req x3
+ len .req x4
+
+ job0_data .req x5
+ job1_data .req x6
+ job2_data .req x7
+ job3_data .req x9
+
+ job0_digest .req x0
+ job1_digest .req x1
+ job2_digest .req x2
+ job3_digest .req x3
+
+ const_adr .req x10
+ end_ptr .req x4
+ data_buf .req x11
+ dig_buf .req x12
+
+ declare_var_vector_reg job0_msg0, 0
+ declare_var_vector_reg job1_msg0, 1
+ declare_var_vector_reg job2_msg0, 2
+ declare_var_vector_reg job3_msg0, 3
+ declare_var_vector_reg job0_msg1, 4
+ declare_var_vector_reg job1_msg1, 5
+ declare_var_vector_reg job2_msg1, 6
+ declare_var_vector_reg job3_msg1, 7
+ declare_var_vector_reg job0_msg2, 8
+ declare_var_vector_reg job1_msg2, 9
+ declare_var_vector_reg job2_msg2, 10
+ declare_var_vector_reg job3_msg2, 11
+ declare_var_vector_reg job0_msg3, 12
+ declare_var_vector_reg job1_msg3, 13
+ declare_var_vector_reg job2_msg3, 14
+ declare_var_vector_reg job3_msg3, 15
+ declare_var_vector_reg job0_tmp0, 16
+ declare_var_vector_reg job1_tmp0, 17
+ declare_var_vector_reg job2_tmp0, 18
+ declare_var_vector_reg job3_tmp0, 19
+ declare_var_vector_reg job0_tmp1, 20
+ declare_var_vector_reg job1_tmp1, 21
+ declare_var_vector_reg job2_tmp1, 22
+ declare_var_vector_reg job3_tmp1, 23
+ declare_var_vector_reg job0_msg4, 24
+ declare_var_vector_reg job1_msg4, 25
+ declare_var_vector_reg job2_msg4, 26
+ declare_var_vector_reg job3_msg4, 27
+ declare_var_vector_reg job0_dig0, 8
+ declare_var_vector_reg job1_dig0, 9
+ declare_var_vector_reg job2_dig0, 10
+ declare_var_vector_reg job3_dig0, 11
+ declare_var_vector_reg job0_dig1, 12
+ declare_var_vector_reg job1_dig1, 13
+ declare_var_vector_reg job2_dig1, 14
+ declare_var_vector_reg job3_dig1, 15
+
+ declare_var_vector_reg job0_backup_dig0, 24
+ declare_var_vector_reg job1_backup_dig0, 25
+ declare_var_vector_reg job2_backup_dig0, 26
+ declare_var_vector_reg job3_backup_dig0, 27
+ declare_var_vector_reg job0_backup_dig1, 28
+ declare_var_vector_reg job1_backup_dig1, 29
+ declare_var_vector_reg job2_backup_dig1, 30
+ declare_var_vector_reg job3_backup_dig1, 31
+
+ declare_var_vector_reg const0, 24
+ declare_var_vector_reg const1, 25
+ declare_var_vector_reg const2, 26
+ declare_var_vector_reg const3, 27
+ declare_var_vector_reg const4, 28
+ declare_var_vector_reg const5, 29
+ declare_var_vector_reg const6, 30
+ declare_var_vector_reg const7, 31
+ declare_var_vector_reg const8, 24
+ declare_var_vector_reg const9, 25
+ declare_var_vector_reg const10, 26
+ declare_var_vector_reg const11, 27
+
+.macro do_rev32_msg job:req,msg:req
+ rev32 v\job\()_\msg\().16b,v\job\()_\msg\().16b
+.endm
+
+.macro do_rev32_job job:req
+ .irp m,0,1,2,3
+ do_rev32_msg \job,msg\m
+ .endr
+.endm
+
+.macro rev32_msgs
+ .irp j,0,1,2,3
+ do_rev32_job job\j
+ .endr
+.endm
+
+.macro do_rev64 job,regd,regn
+ rev64 vjob\job\()_\regd\().16b,vjob\job\()_\regd\().16b
+.endm
+
+.macro do_ldp_msg23 job
+ ldp qjob\job\()_msg2,qjob\job\()_msg3,[job\job\()_data],32
+.endm
+
+ .global sm3_mb_sm_x4
+ .type sm3_mb_sm_x4, %function
+sm3_mb_sm_x4:
+ //push d8~d15
+ sub sp,sp,temp_buf_size
+ stp d8,d9,[sp,-64]!
+ stp d10,d11,[sp,16]
+ stp d12,d13,[sp,32]
+ stp d14,d15,[sp,48]
+
+
+
+ ldr job0_data, [job0],64
+ ldr job1_data, [job1],64
+ ldr job2_data, [job2],64
+ ldr job3_data, [job3],64
+
+ ldp qjob0_dig0,qjob0_dig1,[job0_digest]
+ ldp qjob1_dig0,qjob1_dig1,[job1_digest]
+ ldp qjob2_dig0,qjob2_dig1,[job2_digest]
+ ldp qjob3_dig0,qjob3_dig1,[job3_digest]
+ add end_ptr,job0_data,len,lsl 6
+	//rev128: change digest endianness (byte-reverse each 128-bit half)
+ .irp j,0,1,2,3
+ do_ext \j,dig0,dig0,dig0,#8
+ do_ext \j,dig1,dig1,dig1,#8
+ do_rev64 \j,dig0,dig0
+ do_rev64 \j,dig1,dig1
+ .endr
+
+
+
+
+start_loop:
+ add dig_buf,sp,dig_buf_off
+ ldp qjob0_msg0,qjob0_msg1,[job0_data],32
+ add data_buf,sp,data_buf_off
+ ldp qjob1_msg0,qjob1_msg1,[job1_data],32
+ st1 {vjob0_dig0.16b-vjob3_dig0.16b},[dig_buf],64
+ ldp qjob2_msg0,qjob2_msg1,[job2_data],32
+ st1 {vjob0_dig1.16b-vjob3_dig1.16b},[dig_buf]
+ ldp qjob3_msg0,qjob3_msg1,[job3_data],32
+
+ .irp j,0,1,2,3
+ do_ldp_msg23 \j
+ do_rev32_msg job\j,msg0
+ do_rev32_msg job\j,msg1
+ .endr
+ st1 {vjob0_msg0.16b-vjob3_msg0.16b},[data_buf],64
+ st1 {vjob0_msg1.16b-vjob3_msg1.16b},[data_buf],64
+ .irp j,0,1,2,3
+ do_rev32_msg job\j,msg2
+ do_rev32_msg job\j,msg3
+ .endr
+ st1 {vjob0_msg2.16b-vjob3_msg2.16b},[data_buf],64
+ st1 {vjob0_msg3.16b-vjob3_msg3.16b},[data_buf],64
+
+ cmp job0_data,end_ptr
+
+ /** message expand **/
+ message_expand msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ message_expand msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ message_expand msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ message_expand msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ message_expand msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ message_expand msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ message_expand msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ message_expand msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+ message_expand msg3, msg4, msg0, msg1, msg2, tmp0, tmp1
+ message_expand msg4, msg0, msg1, msg2, msg3, tmp0, tmp1
+ message_expand msg0, msg1, msg2, msg3, msg4, tmp0, tmp1
+ message_expand msg1, msg2, msg3, msg4, msg0, tmp0, tmp1
+ message_expand msg2, msg3, msg4, msg0, msg1, tmp0, tmp1
+
+ /** re-init variables for sm3 rounds **/
+ add dig_buf,sp,dig_buf_off
+ ld1 {vjob0_dig0.16b-vjob3_dig0.16b},[dig_buf],64
+ add data_buf,sp,data_buf_off
+ ld1 {vjob0_dig1.16b-vjob3_dig1.16b},[dig_buf]
+ add dig_buf,sp,dig_buf_off
+ adrp const_adr,.consts
+ ld1 {vjob0_msg0.16b-vjob3_msg0.16b},[data_buf],64
+ add const_adr,const_adr,:lo12:.consts
+ ld1 {vjob0_msg1.16b-vjob3_msg1.16b},[data_buf],64
+ ld1 {vconst0.16b-vconst3.16b},[const_adr],64
+ ld1 {vconst4.16b-vconst7.16b},[const_adr],64
+ /** digests rounds **/
+ quad_round a, const0 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round a, const1 , dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round a, const2 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round a, const3 , dig0, dig1, msg1, msg0, tmp0, tmp1
+
+ /** share registers with vconst0-vconst3 **/
+ ld1 {vconst8.16b-vconst11.16b},[const_adr]
+
+ quad_round b, const4 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const5 , dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round b, const6 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const7 , dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round b, const8 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const9 , dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round b, const10, dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const11, dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round b, const4 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const5 , dig0, dig1, msg1, msg0, tmp0, tmp1
+ quad_round b, const6 , dig0, dig1, msg0, msg1, tmp0, tmp1
+ quad_round b, const7 , dig0, dig1, msg1, msg0, tmp0, tmp1,1
+
+ bcc start_loop
+
+ //rev128
+ .irp j,0,1,2,3
+ do_ext \j,dig0,dig0,dig0,#8
+ do_ext \j,dig1,dig1,dig1,#8
+ do_rev64 \j,dig0,dig0
+ do_rev64 \j,dig1,dig1
+ do_st_digest \j
+ .endr
+
+
+
+exit_ret:
+ ldp d10,d11,[sp,16]
+ ldp d12,d13,[sp,32]
+ ldp d14,d15,[sp,48]
+ ldp d8, d9, [sp], 64
+ add sp,sp,temp_buf_size
+ ret
+
+ .align 2
+.consts:
+ .word 0xce6228cb // 3
+ .word 0xe7311465 // 2
+ .word 0xf3988a32 // 1
+ .word 0x79cc4519 // 0
+ .word 0xe6228cbc // 7
+ .word 0x7311465e // 6
+ .word 0x3988a32f // 5
+ .word 0x9cc45197 // 4
+ .word 0x6228cbce //11
+ .word 0x311465e7 //10
+ .word 0x988a32f3 // 9
+ .word 0xcc451979 // 8
+ .word 0x228cbce6 //15
+ .word 0x11465e73 //14
+ .word 0x88a32f39 //13
+ .word 0xc451979c //12
+ .word 0xec53d43c //19
+ .word 0x7629ea1e //18
+ .word 0x3b14f50f //17
+ .word 0x9d8a7a87 //16
+ .word 0xc53d43ce //23
+ .word 0x629ea1e7 //22
+ .word 0xb14f50f3 //21
+ .word 0xd8a7a879 //20
+ .word 0x53d43cec //27
+ .word 0x29ea1e76 //26
+ .word 0x14f50f3b //25
+ .word 0x8a7a879d //24
+ .word 0x3d43cec5 //31
+ .word 0x9ea1e762 //30
+ .word 0x4f50f3b1 //29
+ .word 0xa7a879d8 //28
+ .word 0xd43cec53 //35
+ .word 0xea1e7629 //34
+ .word 0xf50f3b14 //33
+ .word 0x7a879d8a //32
+ .word 0x43cec53d //39
+ .word 0xa1e7629e //38
+ .word 0x50f3b14f //37
+ .word 0xa879d8a7 //36
+ .word 0x3cec53d4 //43
+ .word 0x1e7629ea //42
+ .word 0x0f3b14f5 //41
+ .word 0x879d8a7a //40
+ .word 0xcec53d43 //47
+ .word 0xe7629ea1 //46
+ .word 0xf3b14f50 //45
+ .word 0x79d8a7a8 //44
+ .word 0xec53d43c //51
+ .word 0x7629ea1e //50
+ .word 0x3b14f50f //49
+
+
+ .size sm3_mb_sm_x4, .-sm3_mb_sm_x4
+
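Unlike the x2/x3 kernels, which interleave message expansion with the rounds via quad_round_expand, the x4 kernel above pre-expands the full message schedule for all four lanes into a stack area (data_buf) and streams it back during the rounds, keeping more vector registers free for the four digests. The expansion computed four words at a time by sm3partw1/sm3partw2 is the standard SM3 recurrence, spelled out in C in sm3_ctx_base.c further below:

    W[j] = P1(W[j-16] ^ W[j-9] ^ rol32(W[j-3], 15)) ^ rol32(W[j-13], 7) ^ W[j-6]
    where P1(x) = x ^ rol32(x, 15) ^ rol32(x, 23)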
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c
new file mode 100644
index 000000000..b1c6ee26b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx2.c
@@ -0,0 +1,284 @@
+/**********************************************************************
+ Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SM3_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len);
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx);
+
+void sm3_mb_mgr_init_avx2(SM3_MB_JOB_MGR * state);
+SM3_JOB *sm3_mb_mgr_submit_avx2(SM3_MB_JOB_MGR * state, SM3_JOB * job);
+SM3_JOB *sm3_mb_mgr_flush_avx2(SM3_MB_JOB_MGR * state);
+
+void sm3_mb_mgr_init_avx2(SM3_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF76543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SM3_X8_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
+
+void sm3_ctx_mgr_init_avx2(SM3_HASH_CTX_MGR * mgr)
+{
+ sm3_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit_avx2(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sm3_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush_avx2(SM3_HASH_CTX_MGR * mgr)
+{
+ SM3_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sm3_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ unsigned int j;
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ ctx->job.result_digest[j] =
+ byteswap32(ctx->job.result_digest[j]);
+ }
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SM3_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SM3_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+ static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+ { SM3_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SM3_PADLENGTHFIELD_SIZE;
+
+#if SM3_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+struct slver sm3_ctx_mgr_init_avx2_slver_0000;
+struct slver sm3_ctx_mgr_init_avx2_slver = { 0x2309, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_submit_avx2_slver_0000;
+struct slver sm3_ctx_mgr_submit_avx2_slver = { 0x230a, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_flush_avx2_slver_0000;
+struct slver sm3_ctx_mgr_flush_avx2_slver = { 0x230b, 0x00, 0x00 };
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
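The init/submit/flush trio above follows the usual isa-l multi-buffer pattern: each message owns an SM3_HASH_CTX, buffers are submitted against a shared SM3_HASH_CTX_MGR, and completed jobs are drained with flush. A minimal one-shot sketch, assuming the hash_ctx_init() helper and SM3_DIGEST_NWORDS provided by the headers this file already includes:

    #include <string.h>
    #include "sm3_mb.h"

    /* Hash one buffer through the AVX2 manager and copy out the digest (sketch). */
    static void sm3_one_shot_avx2(const void *msg, uint32_t len,
                                  uint32_t out[SM3_DIGEST_NWORDS])
    {
            SM3_HASH_CTX_MGR mgr;
            SM3_HASH_CTX ctx;

            sm3_ctx_mgr_init_avx2(&mgr);
            hash_ctx_init(&ctx);            /* mark the ctx as idle/complete */

            sm3_ctx_mgr_submit_avx2(&mgr, &ctx, msg, len, HASH_ENTIRE);
            while (sm3_ctx_mgr_flush_avx2(&mgr))
                    ;                       /* drain until no jobs are in flight */

            memcpy(out, ctx.job.result_digest, SM3_DIGEST_NWORDS * sizeof(uint32_t));
    }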
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c
new file mode 100644
index 000000000..8169aa170
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_avx512.c
@@ -0,0 +1,292 @@
+/**********************************************************************
+ Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#if defined(__clang__)
+# pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__ICC)
+# pragma intel optimization_parameter target_arch=AVX2
+#elif defined(__ICL)
+# pragma [intel] optimization_parameter target_arch=AVX2
+#elif (__GNUC__ >= 5)
+# pragma GCC target("avx2")
+#endif
+
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SM3_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len);
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx);
+
+void sm3_mb_mgr_init_avx512(SM3_MB_JOB_MGR * state);
+SM3_JOB *sm3_mb_mgr_submit_avx512(SM3_MB_JOB_MGR * state, SM3_JOB * job);
+SM3_JOB *sm3_mb_mgr_flush_avx512(SM3_MB_JOB_MGR * state);
+
+void sm3_mb_mgr_init_avx512(SM3_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xfedcba9876543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SM3_MAX_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
+
+void sm3_ctx_mgr_init_avx512(SM3_HASH_CTX_MGR * mgr)
+{
+ sm3_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit_avx512(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+	// A non-zero partial_block_buffer_length means the ctx already holds buffered data,
+	// and len < SM3_BLOCK_SIZE means the incoming data is smaller than one block.
+ if ((ctx->partial_block_buffer_length) | (len < SM3_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SM3_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SM3_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SM3_BLOCK_SIZE) {
+
+ ctx->partial_block_buffer_length = 0;
+ ctx->job.buffer = ctx->partial_block_buffer;
+
+ ctx->job.len = 1;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ }
+
+ }
+
+ return sm3_ctx_mgr_resubmit(mgr, ctx);
+}
+
+static SM3_HASH_CTX *sm3_ctx_mgr_resubmit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ unsigned int j;
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ ctx->job.result_digest[j] =
+ byteswap32(ctx->job.result_digest[j]);
+ }
+ return ctx;
+ }
+		// If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+			// Only whole blocks can be hashed; copy_len is the remainder of len modulo SM3_BLOCK_SIZE.
+ uint32_t copy_len = len & (SM3_BLOCK_SIZE - 1);
+
+			// If there is a partial trailing block, stash it in the extra block buffer.
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ // store the extra data
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+			// len is now a multiple of the block size
+			assert((len % SM3_BLOCK_SIZE) == 0);
+			// Set len to the number of whole blocks to be hashed
+			len >>= SM3_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+			// TODO: confirm whether this should return immediately instead of looping again.
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SM3_BLOCK_SIZE * 2], uint64_t total_len)
+{
+ uint32_t i = (uint32_t) (total_len & (SM3_BLOCK_SIZE - 1));
+
+ memclr_fixedlen(&padblock[i], SM3_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SM3_BLOCK_SIZE - 1) & (0 - (total_len + SM3_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SM3_PADLENGTHFIELD_SIZE;
+
+#if SM3_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = to_be64((uint64_t) total_len << 3);
+
+ return i >> SM3_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush_avx512(SM3_HASH_CTX_MGR * mgr)
+{
+
+ SM3_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SM3_HASH_CTX *) sm3_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sm3_ctx_mgr_resubmit(mgr, ctx);
+
+		// If sm3_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+		// Otherwise, all jobs currently being managed by the SM3_HASH_CTX_MGR still need processing. Loop.
+ }
+
+}
+
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+ static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+ { SM3_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+struct slver sm3_ctx_mgr_init_avx512_slver_0000;
+struct slver sm3_ctx_mgr_init_avx512_slver = { 0x2306, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_submit_avx512_slver_0000;
+struct slver sm3_ctx_mgr_submit_avx512_slver = { 0x2307, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_flush_avx512_slver_0000;
+struct slver sm3_ctx_mgr_flush_avx512_slver = { 0x2308, 0x00, 0x00 };
+
+#endif // HAVE_AS_KNOWS_AVX512
+
+#if defined(__clang__)
+# pragma clang attribute pop
+#endif
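The index arithmetic in hash_pad() above is terse; a worked example, assuming SM3_BLOCK_SIZE = 64 and SM3_PADLENGTHFIELD_SIZE = 8:

    total_len = 100:  i = 100 & 63 = 36, so the 0x80 byte lands at offset 36;
                      i += ((63 & -(100+8+1)) = 19) + 1 + 8, giving i = 64,
                      the 64-bit bit-length is written at bytes 56..63,
                      and the function returns 64 >> 6 = 1 extra block.
    total_len = 120:  i = 56; 63 & -(129) = 63, so i becomes 128 and two
                      extra blocks are returned, as required since
                      56 + 1 + 8 bytes no longer fit in a single block.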
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c
new file mode 100644
index 000000000..e8fcfe08a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base.c
@@ -0,0 +1,314 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+#include "endian_helper.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#if (__GNUC__ >= 11)
+# define OPT_FIX __attribute__ ((noipa))
+#else
+# define OPT_FIX
+#endif
+
+#define rol32(x, r) (((x)<<(r)) | ((x)>>(32-(r))))
+
+static void sm3_init(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static uint32_t OPT_FIX sm3_update(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len);
+static void OPT_FIX sm3_final(SM3_HASH_CTX * ctx, uint32_t remain_len);
+static void OPT_FIX sm3_single(const volatile void *data, uint32_t digest[]);
+static inline void hash_init_digest(SM3_WORD_T * digest);
+
+static inline uint32_t P0(uint32_t X)
+{
+ return (X ^ (rol32(X, 9)) ^ (rol32(X, 17)));
+}
+
+static inline uint32_t P1(uint32_t X)
+{
+ return (X ^ (rol32(X, 15)) ^ (rol32(X, 23)));
+}
+
+static inline uint32_t sm3_ff(int j, uint32_t x, uint32_t y, uint32_t z)
+{
+ return j < 16 ? (x ^ y ^ z) : ((x & y) | (x & z) | (y & z));
+}
+
+static inline uint32_t sm3_gg(int j, uint32_t x, uint32_t y, uint32_t z)
+{
+ return j < 16 ? (x ^ y ^ z) : ((x & y) | ((~x) & z));
+}
+
+static inline void sm3_message_schedule(uint32_t bi[], volatile uint32_t W[],
+ volatile uint32_t W_B[])
+{
+ int j;
+ volatile uint32_t tmp;
+
+ for (j = 0; j <= 15; j++) {
+ W[j] = to_be32(bi[j]);
+ }
+
+ for (; j <= 67; j++) {
+ tmp = W[j - 16] ^ W[j - 9] ^ rol32(W[j - 3], 15);
+ W[j] = P1(tmp) ^ (rol32(W[j - 13], 7)) ^ W[j - 6];
+ }
+
+ for (j = 0; j < 64; j++) {
+ W_B[j] = W[j] ^ W[j + 4];
+ }
+
+ tmp = 0;
+}
+
+static inline void sm3_compress_step_func(int j, volatile uint32_t * a_p,
+ volatile uint32_t * b_p, volatile uint32_t * c_p,
+ volatile uint32_t * d_p, volatile uint32_t * e_p,
+ volatile uint32_t * f_p, volatile uint32_t * g_p,
+ volatile uint32_t * h_p, volatile uint32_t W[],
+ volatile uint32_t W_B[])
+{
+ volatile uint32_t SS1, SS2, TT1, TT2;
+ uint32_t T = j < 16 ? 0x79cc4519 : 0x7a879d8a;
+
+ SS1 = rol32(rol32(*a_p, 12) + *e_p + rol32(T, (j % 32)), 7);
+ SS2 = SS1 ^ rol32(*a_p, 12);
+ TT1 = sm3_ff(j, *a_p, *b_p, *c_p) + *d_p + SS2 + W_B[j];
+ TT2 = sm3_gg(j, *e_p, *f_p, *g_p) + *h_p + SS1 + W[j];
+ *d_p = *c_p;
+ *c_p = rol32(*b_p, 9);
+ *b_p = *a_p;
+ *a_p = TT1;
+ *h_p = *g_p;
+ *g_p = rol32(*f_p, 19);
+ *f_p = *e_p;
+ *e_p = P0(TT2);
+
+ SS1 = 0;
+ SS2 = 0;
+ TT1 = 0;
+ TT2 = 0;
+}
+
+void sm3_ctx_mgr_init_base(SM3_HASH_CTX_MGR * mgr)
+{
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit_base(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ uint32_t remain_len;
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_PROCESSING) && (flags == HASH_ENTIRE)) {
+ // Cannot submit a new entire job to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags == HASH_FIRST) {
+ if (len % SM3_BLOCK_SIZE != 0) {
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+ sm3_init(ctx, buffer, len);
+ sm3_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_UPDATE) {
+ if (len % SM3_BLOCK_SIZE != 0) {
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+ sm3_update(ctx, buffer, len);
+ }
+
+ if (flags == HASH_LAST) {
+ remain_len = sm3_update(ctx, buffer, len);
+ sm3_final(ctx, remain_len);
+ }
+
+ if (flags == HASH_ENTIRE) {
+ sm3_init(ctx, buffer, len);
+ remain_len = sm3_update(ctx, buffer, len);
+ sm3_final(ctx, remain_len);
+ }
+
+ return ctx;
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush_base(SM3_HASH_CTX_MGR * mgr)
+{
+ return NULL;
+}
+
+static void sm3_init(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Mark it as processing
+ ctx->status = HASH_CTX_STS_PROCESSING;
+}
+
+static uint32_t sm3_update(SM3_HASH_CTX * ctx, const void *buffer, uint32_t len)
+{
+ uint32_t remain_len = len;
+ uint32_t *digest = ctx->job.result_digest;
+
+ while (remain_len >= SM3_BLOCK_SIZE) {
+ sm3_single(buffer, digest);
+ buffer = (void *)((uint8_t *) buffer + SM3_BLOCK_SIZE);
+ remain_len -= SM3_BLOCK_SIZE;
+ ctx->total_length += SM3_BLOCK_SIZE;
+ }
+
+ ctx->incoming_buffer = buffer;
+ return remain_len;
+}
+
+static void sm3_final(SM3_HASH_CTX * ctx, uint32_t remain_len)
+{
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t i = remain_len;
+ uint32_t j;
+ volatile uint8_t buf[2 * SM3_BLOCK_SIZE] = { 0 };
+ uint32_t *digest = ctx->job.result_digest;
+
+ ctx->total_length += i;
+ memcpy((void *)buf, buffer, i);
+ buf[i++] = 0x80;
+
+ i = (i > SM3_BLOCK_SIZE - SM3_PADLENGTHFIELD_SIZE ?
+ 2 * SM3_BLOCK_SIZE : SM3_BLOCK_SIZE);
+
+ *(uint64_t *) (buf + i - 8) = to_be64((uint64_t) ctx->total_length * 8);
+
+ sm3_single(buf, digest);
+ if (i == 2 * SM3_BLOCK_SIZE) {
+ sm3_single(buf + SM3_BLOCK_SIZE, digest);
+ }
+
+	/* byte-swap the digest words into the output byte order */
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ digest[j] = byteswap32(digest[j]);
+ }
+
+ ctx->status = HASH_CTX_STS_COMPLETE;
+ memset((void *)buf, 0, sizeof(buf));
+}
+
+static void sm3_single(const volatile void *data, uint32_t digest[])
+{
+ volatile uint32_t a, b, c, d, e, f, g, h;
+ volatile uint32_t W[68], W_bar[64];
+ int j;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ sm3_message_schedule((uint32_t *) data, W, W_bar);
+ for (j = 0; j < 64; j++) {
+ sm3_compress_step_func(j, &a, &b, &c, &d, &e, &f, &g, &h, W, W_bar);
+ }
+
+ digest[0] ^= a;
+ digest[1] ^= b;
+ digest[2] ^= c;
+ digest[3] ^= d;
+ digest[4] ^= e;
+ digest[5] ^= f;
+ digest[6] ^= g;
+ digest[7] ^= h;
+
+ memset((void *)W, 0, sizeof(W));
+ memset((void *)W_bar, 0, sizeof(W_bar));
+
+ a = 0;
+ b = 0;
+ c = 0;
+ d = 0;
+ e = 0;
+ f = 0;
+ g = 0;
+ h = 0;
+}
+
+static inline void hash_init_digest(SM3_WORD_T * digest)
+{
+ static const SM3_WORD_T hash_initial_digest[SM3_DIGEST_NWORDS] =
+ { SM3_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sm3_ctx_mgr_init_base_slver_0000;
+struct slver sm3_ctx_mgr_init_base_slver = { 0x2303, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_submit_base_slver_0000;
+struct slver sm3_ctx_mgr_submit_base_slver = { 0x2304, 0x00, 0x00 };
+
+struct slver sm3_ctx_mgr_flush_base_slver_0000;
+struct slver sm3_ctx_mgr_flush_base_slver = { 0x2305, 0x00, 0x00 };
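Note that the base submit path only accepts HASH_FIRST and HASH_UPDATE chunks whose length is a whole multiple of SM3_BLOCK_SIZE; only the HASH_LAST (or HASH_ENTIRE) call may carry a ragged tail, which sm3_final() pads. A minimal streaming sketch under that constraint, using only names declared in this file:

    #include "sm3_mb.h"

    /* Hash a message fed as two block-aligned chunks plus a tail (sketch). */
    static void sm3_base_stream(SM3_HASH_CTX_MGR *mgr, SM3_HASH_CTX *ctx,
                                const uint8_t *p1, uint32_t len1,   /* multiple of 64 */
                                const uint8_t *p2, uint32_t len2,   /* multiple of 64 */
                                const uint8_t *tail, uint32_t tlen) /* any length */
    {
            sm3_ctx_mgr_init_base(mgr);
            sm3_ctx_mgr_submit_base(mgr, ctx, p1, len1, HASH_FIRST);
            sm3_ctx_mgr_submit_base(mgr, ctx, p2, len2, HASH_UPDATE);
            sm3_ctx_mgr_submit_base(mgr, ctx, tail, tlen, HASH_LAST);
            /* the digest now sits in ctx->job.result_digest */
    }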
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c
new file mode 100644
index 000000000..d74a4c882
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ctx_base_aliases.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ Copyright(c) 2019 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include "sm3_mb.h"
+#include "memcpy_inline.h"
+
+extern void sm3_ctx_mgr_init_base(SM3_HASH_CTX_MGR * mgr);
+extern SM3_HASH_CTX *sm3_ctx_mgr_submit_base(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags);
+extern SM3_HASH_CTX *sm3_ctx_mgr_flush_base(SM3_HASH_CTX_MGR * mgr);
+
+void sm3_ctx_mgr_init(SM3_HASH_CTX_MGR * mgr)
+{
+ return sm3_ctx_mgr_init_base(mgr);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_submit(SM3_HASH_CTX_MGR * mgr, SM3_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ return sm3_ctx_mgr_submit_base(mgr, ctx, buffer, len, flags);
+}
+
+SM3_HASH_CTX *sm3_ctx_mgr_flush(SM3_HASH_CTX_MGR * mgr)
+{
+ return sm3_ctx_mgr_flush_base(mgr);
+}
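
Editor's note: these aliases expose the public sm3_ctx_mgr_init/submit/flush entry points that the tests added below exercise. As a condensed usage sketch of that API (error handling trimmed; hash_one is a hypothetical helper name, and with a single buffer the multi-buffer scheduling adds nothing beyond showing the call sequence):

    #include <stdlib.h>
    #include "sm3_mb.h"

    int hash_one(const void *buf, uint32_t len, uint32_t out[SM3_DIGEST_NWORDS])
    {
            SM3_HASH_CTX_MGR *mgr = NULL;
            SM3_HASH_CTX ctx;

            /* The manager is allocated 16-byte aligned, as in the tests below. */
            if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
                    return 1;
            sm3_ctx_mgr_init(mgr);

            hash_ctx_init(&ctx);
            sm3_ctx_mgr_submit(mgr, &ctx, buf, len, HASH_ENTIRE);
            while (sm3_ctx_mgr_flush(mgr))    /* drain any remaining lanes */
                    ;

            for (int i = 0; i < SM3_DIGEST_NWORDS; i++)
                    out[i] = ctx.job.result_digest[i];
            free(mgr);
            return 0;
    }
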
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm
new file mode 100644
index 000000000..0f2a0f39a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_job.asm
@@ -0,0 +1,65 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Threshold constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; if the number of lanes in use is <= threshold, use the single-buffer (sb) function
+%define SM3_SB_THRESHOLD_SSE 1
+%define SM3_SB_THRESHOLD_AVX 1
+%define SM3_SB_THRESHOLD_AVX2 1
+%define SM3_SB_THRESHOLD_AVX512 1
+%define SM3_NI_SB_THRESHOLD_SSE 4 ; shani is faster than sse sha256_mb
+%define SM3_NI_SB_THRESHOLD_AVX512 6
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SM3_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS	; SM3_JOB
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD _len, 8, 8 ; length in bytes
+FIELD _result_digest, 8*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+
+%assign _SM3_JOB_size _FIELD_OFFSET
+%assign _SM3_JOB_align _STRUCT_ALIGN
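
Editor's note: the FIELD macros above describe the job layout as seen from assembly. Roughly, the C side sees the same fields; the sketch below is only an approximation for orientation (the authoritative definition is SM3_JOB in sm3_mb.h), with sizes and alignment taken from the FIELD lines above.

    #include <stdint.h>

    /* Approximate C mirror of the layout defined above (see sm3_mb.h). */
    typedef struct {
            uint8_t  *buffer;                    /* _buffer: pointer to data         */
            uint64_t  len;                       /* _len: length to hash             */
            uint32_t  result_digest[8]           /* _result_digest: 8 x 32-bit words */
                      __attribute__((aligned(64)));
            uint32_t  status;                    /* _status: STS_* value             */
            void     *user_data;                 /* _user_data                       */
    } SM3_JOB_sketch;
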
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c
new file mode 100644
index 000000000..fbbb2a1a7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_flush_test.c
@@ -0,0 +1,145 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define ISAL_UNIT_TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS (SM3_MAX_LANES - 1)
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint8_t digest_ref[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+uint8_t lens_print_and_check(SM3_HASH_CTX_MGR * mgr)
+{
+ static int32_t last_lens[SM3_MAX_LANES] = { 0 };
+ int32_t len;
+ uint8_t num_unchanged = 0;
+ int i;
+ for (i = 0; i < SM3_MAX_LANES; i++) {
+ len = (int32_t) mgr->mgr.lens[i];
+		// len[i] in mgr packs (block_count << 4) | lane_index
+		len = (len >= 16) ? (len >> 4 << 6) : 0;	// blocks to bytes (64-byte SM3 blocks)
+ printf("\t%d", len);
+ if (last_lens[i] > 0 && last_lens[i] == len)
+ num_unchanged += 1;
+ last_lens[i] = len;
+ }
+ printf("\n");
+ return num_unchanged;
+}
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ uint8_t num_ret, num_unchanged = 0;
+ int ret;
+
+ printf("sm3_mb flush test, %d buffers with %d length: \n", TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ lens[i] = TEST_LEN / SM3_MAX_LANES * (i + 1);
+ bufs[i] = (unsigned char *)malloc(lens[i]);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], lens[i]);
+ }
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sm3_ossl(bufs[i], lens[i], digest_ref[i]);
+
+ // Run sb_sm3 test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ printf("Changes of lens inside mgr:\n");
+ lens_print_and_check(mgr);
+ while (sm3_ctx_mgr_flush(mgr)) {
+ num_ret = lens_print_and_check(mgr);
+ num_unchanged = num_unchanged > num_ret ? num_unchanged : num_ret;
+ }
+ printf("Info of sm3_mb lens prints over\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf("Pass\n");
+
+ return fail;
+}
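
Editor's note: lens_print_and_check() above decodes mgr->mgr.lens[]: each entry packs the remaining work (in SM3 blocks) in the upper bits and the lane index in the low nibble, which is why the test shifts right by 4 and multiplies by the 64-byte block size. A small encode/decode sketch, with the field layout inferred from the shifts used here and in the submit/flush assembly:

    #include <stdint.h>

    #define SM3_BLOCK_SIZE_SKETCH 64   /* SM3_BLOCK_SIZE comes from sm3_mb.h */

    static inline uint32_t lens_pack(uint32_t blocks, uint32_t lane)
    {
            return (blocks << 4) | (lane & 0xF);
    }

    static inline uint32_t lens_remaining_bytes(uint32_t len_word)
    {
            return (len_word >> 4) * SM3_BLOCK_SIZE_SKETCH;
    }
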
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..a2319ba14
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_datastruct.asm
@@ -0,0 +1,77 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SM3 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SM3_ARGS_X16
+;;; name size align
+FIELD _digest, 4*8*16, 4 ; transposed digest
+FIELD _data_ptr, 8*16, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _SM3_ARGS_X4_size _FIELD_OFFSET
+%assign _SM3_ARGS_X4_align _STRUCT_ALIGN
+%assign _SM3_ARGS_X8_size _FIELD_OFFSET
+%assign _SM3_ARGS_X8_align _STRUCT_ALIGN
+%assign _SM3_ARGS_X16_size _FIELD_OFFSET
+%assign _SM3_ARGS_X16_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _SM3_ARGS_X4_size, _SM3_ARGS_X4_align
+FIELD _lens, 4*16, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*16, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
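
Editor's note: for orientation, the manager layout spelled out above corresponds roughly to the C view below (a sketch only; the authoritative struct is SM3_MB_JOB_MGR in sm3_mb.h). The digest area is transposed: 8 words by 16 lanes, stored word-major, so lane i's word w lives at digest[w][i] -- matching the 4*idx + w*4*16 addressing used in the flush/submit code.

    #include <stdint.h>

    /* Approximate C mirror of the MB_MGR fields defined above. */
    typedef struct {
            uint32_t digest[8][16];                   /* _args._digest: transposed digest */
            uint8_t *data_ptr[16];                    /* _args._data_ptr: one per lane    */
            uint32_t lens[16];                        /* _lens: (blocks << 4) | lane      */
            uint64_t unused_lanes;                    /* _unused_lanes: packed 4-bit list */
            struct { void *job_in_lane; } ldata[16];  /* _ldata                           */
            uint32_t num_lanes_inuse;                 /* _num_lanes_inuse                 */
    } SM3_MB_JOB_MGR_sketch;
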
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..b87bdcba8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx2.asm
@@ -0,0 +1,258 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_job.asm"
+%include "sm3_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sm3_mb_x8_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sm3_mb_x8_avx2
+%define idx rbp
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SM3_JOB* sm3_mb_mgr_flush_avx2(SM3_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sm3_mb_mgr_flush_avx2, function
+sm3_mb_mgr_flush_avx2:
+ endbranch
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+	; use num_lanes_inuse to check whether all lanes are empty
+ cmp dword [state + _num_lanes_inuse], 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+	; copy the data pointer of lane "idx" into the empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+mb_processing:
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sm3_mb_x8_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
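
Editor's note: the vpminud/vpalignr ladder above reduces the eight packed lens words to their minimum, so the manager knows how many whole blocks every occupied lane can advance by and which lane finishes first. A scalar C equivalent of that search (a sketch; the real code keeps everything in xmm registers):

    #include <stdint.h>

    /* Scalar equivalent of the SIMD min-length search above. */
    static void find_min_len(const uint32_t lens[8], uint32_t *idx, uint32_t *blocks)
    {
            uint32_t min = lens[0];
            for (int i = 1; i < 8; i++)
                    if (lens[i] < min)
                            min = lens[i];
            *idx    = min & 0xF;   /* low nibble: lane index of the shortest job */
            *blocks = min >> 4;    /* upper bits: number of whole blocks to hash */
    }
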
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..7feada49f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_flush_avx512.asm
@@ -0,0 +1,276 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_job.asm"
+%include "sm3_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sm3_mb_x16_avx512
+;extern sm3_opt_x1
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define arg1 rdi ; rcx
+ %define arg2 rsi ; rdx
+ %define tmp4 rdx
+%else
+ %define arg1 rcx
+ %define arg2 rdx
+ %define tmp4 rsi
+%endif
+
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define idx rbp
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+
+; SM3_JOB* sm3_mb_mgr_flush_avx512(SM3_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+mk_global sm3_mb_mgr_flush_avx512, function
+sm3_mb_mgr_flush_avx512:
+ endbranch
+
+ ; Save the stack
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+
+	; copy the data pointer of lane "idx" into the empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+	; flush could check the remaining lane count here and fall back to a single-buffer (x1) routine
+
+mb_processing:
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sm3_mb_x16_avx512
+ ; state and idx are intact
+
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+
+; restore registers and return
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sm3_mb_mgr_flush_avx512
+no_sm3_mb_mgr_flush_avx512:
+%endif
+
+%endif ; HAVE_AS_KNOWS_AVX512
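
Editor's note: before the min-length search, flush points every empty lane at the data of a lane that still holds a job and sets its length word to 0xFFFFFFFF, so empty lanes never win the minimum and the x16 kernel always has 16 valid pointers. In C terms (a sketch of the copy_lane_data loop above; field names follow the illustrative MB_MGR mirror given earlier in these notes):

    #include <stddef.h>

    /* Sketch of copy_lane_data: park empty lanes on a lane that still has a job. */
    static void park_empty_lanes(SM3_MB_JOB_MGR_sketch *state, int idx)
    {
            for (int i = 0; i < 16; i++) {
                    if (state->ldata[i].job_in_lane == NULL) {
                            state->data_ptr[i] = state->data_ptr[idx]; /* idx: a non-empty lane */
                            state->lens[i] = 0xFFFFFFFF;               /* never the minimum     */
                    }
            }
    }
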
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..ae95faa89
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx2.asm
@@ -0,0 +1,247 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_job.asm"
+%include "memcpy.asm"
+%include "sm3_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sm3_mb_x8_avx2
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define idx r8
+%define last_len r8
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; SM3_JOB* sm3_mb_mgr_submit_avx2(SM3_MB_JOB_MGR *state, SM3_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+mk_global sm3_mb_mgr_submit_avx2, function
+sm3_mb_mgr_submit_avx2:
+ endbranch
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ mov [lane_data + _job_in_lane], job
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovdqu xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*8], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*8], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*8], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*8], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*4*8], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*4*8], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*4*8], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*4*8], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ add dword [state + _num_lanes_inuse], 1
+ cmp unused_lanes, 0xf
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sm3_mb_x8_avx2
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ sub dword [state + _num_lanes_inuse], 1
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+
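
Editor's note: submit pops the next free lane from the packed unused_lanes list (4 bits per lane index), records the job, and only calls the x8 kernel once every lane is occupied; partially filled batches are left for flush. A sketch of the lane-allocation step, mirroring the and/shr and shl/or sequence above (job_len_blocks stands in for the job's _len field, and the struct type is the illustrative mirror introduced earlier):

    #include <stdint.h>

    /* Sketch of the lane-allocation step in sm3_mb_mgr_submit_avx2. */
    static uint32_t alloc_lane(SM3_MB_JOB_MGR_sketch *state, void *job, uint32_t job_len_blocks)
    {
            uint32_t lane = state->unused_lanes & 0xF;         /* low nibble = next free lane */
            state->unused_lanes >>= 4;                         /* pop it from the packed list */
            state->lens[lane] = (job_len_blocks << 4) | lane;  /* same shl/or as above        */
            state->ldata[lane].job_in_lane = job;
            state->num_lanes_inuse++;
            return lane;
    }
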
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..7b7b21287
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_mgr_submit_avx512.asm
@@ -0,0 +1,273 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_job.asm"
+%include "memcpy.asm"
+%include "sm3_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+;
+; SM3_JOB* sm3_mb_mgr_submit_avx512 (SM3_MB_JOB_MGR *state, SM3_JOB* job);
+;
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+;todo sm3_mb_x16_avx512
+extern sm3_mb_x16_avx512
+
+[bits 64]
+default rel
+section .text
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define arg1 rdi ; state
+%define arg2 rsi ; job
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx ; state
+%define arg2 rdx ; job
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2 ;
+%define len2 arg2 ; + offset
+%define p2 arg2 ; need + offset
+
+%define idx r8
+%define last_len r8
+%define p r11
+%define start_offset r11
+%define num_lanes_inuse r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+; TODO: verify the stack layout below
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+mk_global sm3_mb_mgr_submit_avx512, function
+sm3_mb_mgr_submit_avx512:
+ endbranch
+
+ ; save these registers
+ sub rsp, STACK_SPACE
+	; rsp holds the stack pointer; save GPRs at the bottom of the reserved area
+ mov [rsp + 8*0], rbx
+	mov	[rsp + 8*3], rbp	; slots 1 and 2 are only used for rsi/rdi on win64
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+	; rbx, rbp and r12-r15 are now saved on the stack
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+	; unused_lanes (rbx) and lane (rbp) have both been saved above
+	; the low nibble of unused_lanes holds the index of the next free lane
+	and	lane, 0xF
+
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ mov [lane_data + _job_in_lane], job
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovdqu xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*16], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*16], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*16], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*16], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*4*16], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*4*16], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*4*16], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*4*16], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+	; only start hashing once all 16 lanes are in use
+ cmp num_lanes_inuse, 16
+ jne return_null
+
+start_loop:
+	; Find min length; ymm0 holds the first 8 lens, ymm1 the last 8
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sm3_mb_x16_avx512
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+; restore stack
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=32
+
+align 32
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sm3_mb_mgr_submit_avx512
+no_sm3_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
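
Editor's note: when a job completes, its lane index is pushed back onto unused_lanes (shift left by 4, OR in the index) and num_lanes_inuse is decremented -- the mirror image of the pop done at submit time. A sketch of that release path (the len_is_0 section above), again using the illustrative struct names from earlier notes:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of lane release after a job completes. */
    static void release_lane(SM3_MB_JOB_MGR_sketch *state, SM3_JOB_sketch *job, uint32_t idx)
    {
            state->ldata[idx].job_in_lane = NULL;
            state->unused_lanes = (state->unused_lanes << 4) | idx;  /* push lane back */
            state->num_lanes_inuse--;
            job->status = 2;  /* STS_COMPLETED, per sm3_job.asm */
    }
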
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c
new file mode 100644
index 000000000..b904ba0ca
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_ssl_test.c
@@ -0,0 +1,160 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define ISAL_UNIT_TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+
+// Generates pseudo-random data
+static void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ int ret;
+
+ printf("multibinary_sm3 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ sm3_ossl(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_sm3 test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sm3_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ sm3_ossl(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_sm3 test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sm3_ssl rand: Pass\n");
+
+ return fail;
+}
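
Editor's note: sm3_ossl() is the OpenSSL-backed reference these tests link against; its definition lives elsewhere in the tree. Assuming an OpenSSL build with SM3 enabled (1.1.1 or later), such a wrapper could look like the following -- this is a guess at the shape, not the file actually used here:

    #include <openssl/evp.h>

    void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest)
    {
            /* EVP_sm3() requires OpenSSL built with SM3 support. */
            EVP_Digest(buf, length, digest, NULL, EVP_sm3(), NULL);
    }
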
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c
new file mode 100644
index 000000000..3671a3b79
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_test.c
@@ -0,0 +1,206 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define ISAL_UNIT_TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint8_t digest_ref[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+
+// Generates pseudo-random data
+static void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+ int ret;
+
+ printf("multibinary_sm3 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sm3_ossl(bufs[i], TEST_LEN, digest_ref[i]);
+
+ // Run sb_sm3 test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sm3_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ sm3_ossl(bufs[i], lens[i], digest_ref[i]);
+
+ // Run sm3_mb test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%08X <=> 0x%08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ sm3_ctx_mgr_init(mgr);
+
+ // Extend to the end of allocated buffer to construct jobs
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ sm3_ossl(bufs[i], lens[i], digest_ref[i]);
+
+ // sb_sm3 test
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%08X"
+ ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sm3 rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c
new file mode 100644
index 000000000..64e583ffc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_rand_update_test.c
@@ -0,0 +1,298 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define ISAL_UNIT_TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE (13*SM3_BLOCK_SIZE)
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SM3_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ref[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+
+// Generates pseudo-random data
+static void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+ int ret;
+
+ printf("multibinary_sm3_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sm3_ossl(bufs[i], TEST_LEN, digest_ref[i]);
+ }
+
+ // Run sb_sm3 tests
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+		// Move on to the next job if this one was queued (NULL) or came back finished
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sm3_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sm3_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sm3_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+				printf("Test%d fixed size, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ sm3_ossl(bufs[i], lens[i], digest_ref[i]);
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ // Run sm3_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = SM3_BLOCK_SIZE +
+ SM3_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+			// The returned ctx can be:
+			//  - NULL (we are just getting started and the lanes are not full yet), or
+			//  - already finished (the ENTIRE job just submitted, or an earlier LAST, is returned), or
+			//  - an unfinished ctx, which we resubmit below
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % SM3_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem, HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sm3_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sm3_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % SM3_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sm3_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ref[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sm3_update rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c
new file mode 100644
index 000000000..c409530c7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_test.c
@@ -0,0 +1,250 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sm3_mb.h"
+
+typedef struct {
+ const char *msg;
+ uint32_t resultDigest[SM3_DIGEST_NWORDS];
+} TestData;
+
+static TestData test_data[] = {
+ {
+ .msg = "abc",
+ .resultDigest = {0xf4f0c766, 0xd9edee62, 0x6bd4f2d1, 0xe2e410dc,
+ 0x87c46741, 0xa2f7f25c, 0x2ba07d29, 0xe0a84b8f}
+ },
+ {
+ .msg = "abcdabcdabcdabcdabcdabcdabcdabcd" "abcdabcdabcdabcdabcdabcdabcdabcd",
+ .resultDigest = {0xf99fbede, 0xa1b87522, 0x89486038, 0x4d5a8ec1,
+ 0xe570db6f, 0x65577e38, 0xa3cb3d29, 0x32570c9c}
+
+ },
+ {
+ .msg = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq",
+ .resultDigest = {0xc56c9b63, 0x379e4de6, 0x92b190a3, 0xeaa14fdf,
+ 0x74ab2007, 0xb992f67f, 0x664e8cf3, 0x058c7bad}
+ },
+
+ {.msg = "0123456789:;<=>?@ABCDEFGHIJKLMNO",
+ .resultDigest = {0x076833d0, 0xd089ec39, 0xad857685, 0x8089797a,
+ 0x9df9e8fd, 0x4126eb9a, 0xf38c22e8, 0x054bb846}},
+ {
+ .msg =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<",
+ .resultDigest = {0x6cb9d38e, 0x846ac99e, 0x6d05634b, 0x3fe1bb26,
+ 0x90368c4b, 0xee8c4299, 0x08c0e96a, 0x2233cdc7}
+ },
+ {
+ .msg =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR",
+ .resultDigest = {0x83758189, 0x050f14d1, 0x91d8a730, 0x4a2825e4,
+ 0x11723273, 0x2114ee3f, 0x18cac172, 0xa9c5b07a}
+ },
+ {
+ .msg =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?",
+ .resultDigest = {0xb80f8aba, 0x55e96119, 0x851ac77b, 0xae31b3a5,
+ 0x1333e764, 0xc86ac40d, 0x34878db1, 0x7da873f6},
+ },
+ {
+ .msg =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU",
+ .resultDigest = {0xbd5736a7, 0x55977d13, 0xa950c78a, 0x71eeb7cb,
+ 0xe9ef0ba5, 0x95a9302e, 0x155e5c33, 0xad96ce3c}
+ },
+ {
+ .msg = "",
+ .resultDigest = {0x831db21a, 0x7fa1cf55, 0x4819618e, 0x8f1ae831,
+ 0xc7c8be22, 0x74fbfe28, 0xeb35d07e, 0x2baa8250}
+
+ },
+
+};
+
+#define MSGS sizeof(test_data)/sizeof(TestData)
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) (((seed) * 5 + ((seed) * (seed)) / 64) % MSGS)
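+// For illustration (not part of the test logic): with the 9 vectors above,
+// MSGS == 9, so PSEUDO_RANDOM_NUM(0) == 0, PSEUDO_RANDOM_NUM(1) == 5 and
+// PSEUDO_RANDOM_NUM(2) == 1 -- a cheap deterministic scramble of the job
+// index onto a test-vector index (integer division means the (seed*seed)/64
+// term only contributes once seed >= 8).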
+
+int main(void)
+{
+
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+ int ret;
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sm3_ctx_mgr_init(mgr);
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i], test_data[i].msg,
+ strlen((char *)test_data[i].msg), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = test_data[t].resultDigest;
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
+ while (1) {
+ ctx = sm3_ctx_mgr_flush(mgr);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = test_data[t].resultDigest;
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ test_data[j].msg, strlen((char *)test_data[j].msg),
+ HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = test_data[k].resultDigest;
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = sm3_ctx_mgr_flush(mgr);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = test_data[k].resultDigest;
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sm3 test: Pass\n");
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..ed4d9a092
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_perf.c
@@ -0,0 +1,128 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sm3_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sm3_ossl(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sm3_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sm3" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer sm3 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sm3_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c
new file mode 100644
index 000000000..025fd90ed
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_vs_ossl_shortage_perf.c
@@ -0,0 +1,133 @@
+/**********************************************************************
+ Copyright(c) 2020 Arm Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Arm Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS SM3_MAX_LANES
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+extern void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SM3_DIGEST_NWORDS];
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ uint32_t nlanes;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ int ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if (ret) {
+ printf("alloc error: Fail");
+ return -1;
+ }
+ sm3_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sm3_ossl(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sm3_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb shortage tests
+ for (nlanes = TEST_BUFS; nlanes > 0; nlanes--) {
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < nlanes; i++)
+ sm3_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN,
+ HASH_ENTIRE);
+
+ while (sm3_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sm3" TEST_TYPE_STR " with %d lanes: ", nlanes);
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < nlanes; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ to_le32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ }
+
+ printf("Multi-buffer sm3 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+		printf(" multibinary_sm3_ossl_shortage_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm
new file mode 100644
index 000000000..3b300fa80
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x16_avx512.asm
@@ -0,0 +1,1035 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+[bits 64]
+default rel
+section .text
+
+; Define Stack Layout
+START_FIELDS
+;;; name size align
+FIELD _DIGEST_SAVE, 8*64, 64
+FIELD _rsp, 8, 8
+%assign STACK_SPACE _FIELD_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0 preserved
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2 preserved
+ %define reg4 r9 ; arg3
+ %define var1 rdi
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var1 rdx ; arg2
+ %define var2 rcx ; arg3
+ %define local_func_decl(func_name) mk_global func_name, function, internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr) ; rdi + 8*16
+%define DIGEST state ; rdi
+%define SIZE num_blks ; rsi
+
+%define IDX var1
+%define TBL var2
+
+%define APPEND(a,b) a %+ b
+
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+
+;
+; 4 ZMM for tmp data
+;
+%define TMP0 zmm8
+%define TMP1 zmm9
+%define TMP2 zmm10
+%define TMP3 zmm11
+
+;
+; The word array W[] expands to 64 entries and WB[] to 68 entries:
+; WB[j]:
+;     tmp   = WB[j - 16] ^ WB[j - 9] ^ rol32(WB[j - 3], 15);
+;     WB[j] = P1(tmp) ^ rol32(WB[j - 13], 7) ^ WB[j - 6];
+; W[j]:
+;     W[j] = WB[j] ^ WB[j + 4]
+;
+; Twenty ZMM registers (zmm12~zmm31) hold a sliding window of WB values,
+; because computing W[j] needs WB[j - 16] through WB[j + 4], i.e. 20 entries.
+;
+; The transposed lane data is also kept in zmm12~zmm27 (WB0~WB15); once the
+; WB values have been computed, the raw lane copies are no longer needed.
+;
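+; For reference, a scalar C sketch of the expansion that is vectorized here
+; (illustrative only; rol32() and P1() stand for a 32-bit left-rotate helper
+; and the permutation implemented by the P1 macro further below):
+;
+;     uint32_t WB[68], W[64];
+;     /* WB[0..15] are the 16 message words of the 512-bit block */
+;     for (int j = 16; j < 68; j++) {
+;         uint32_t tmp = WB[j - 16] ^ WB[j - 9] ^ rol32(WB[j - 3], 15);
+;         WB[j] = P1(tmp) ^ rol32(WB[j - 13], 7) ^ WB[j - 6];
+;     }
+;     for (int j = 0; j < 64; j++)
+;         W[j] = WB[j] ^ WB[j + 4];
+;
+; In the vector code each ZMM below holds one WB entry for all 16 lanes, and
+; the 68-entry sequence lives in a 20-register window (indices taken mod 20).
+;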
+%define WB0 zmm12
+%define WB1 zmm13
+%define WB2 zmm14
+%define WB3 zmm15
+%define WB4 zmm16
+%define WB5 zmm17
+%define WB6 zmm18
+%define WB7 zmm19
+
+%define WB8 zmm20
+%define WB9 zmm21
+%define WB10 zmm22
+%define WB11 zmm23
+%define WB12 zmm24
+%define WB13 zmm25
+%define WB14 zmm26
+%define WB15 zmm27
+
+%define WB16 zmm28
+%define WB17 zmm29
+%define WB18 zmm30
+%define WB19 zmm31
+
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+;
+; same as sha256
+;
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; use r6 in place of t0
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
+
+ vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 113 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
+ vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 114 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
+ vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 115 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
+ vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 112 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
+
+ ; use r10 in place of t0
+ vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 a1 m0}
+ vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 a3 m2}
+ vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 013 012 p9 p8 09 08 p5 p4 05 04 p1 p0 01 00}
+ vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 015 014 p11 p10 011 010 p7 p6 07 06 p3 p2 03 02}
+
+ vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 013 n13 m13 p9 09 n9 m9 p5 05 n5 m5 p1 01 n1 m1}
+ vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 014 n14 m14 p10 010 n10 m10 p6 06 n6 m6 p2 02 n2 m2}
+ vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 015 n15 m15 p11 011 n11 m11 p7 07 n7 m7 p3 03 n3 m3}
+ vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 012 n12 m12 p8 08 n8 m8 p4 04 n4 m4 p0 00 n0 m0}
+
+ vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
+ vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
+
+ vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
+ vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
+
+ vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
+ vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
+
+ vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
+ vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
+
+ ;; At this point r8 and r12 can be used as scratch registers
+
+ vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+
+ vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+ vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+ vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+ vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+ vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+ vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+
+ vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+%endmacro
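+
+; Net effect of TRANSPOSE16, as an illustrative scalar sketch (in/out viewed
+; as 16x16 matrices of dwords, one row per lane register):
+;
+;     for (int w = 0; w < 16; w++)
+;         for (int lane = 0; lane < 16; lane++)
+;             out[w][lane] = in[lane][w];
+;
+; i.e. after the macro, register k holds message word k of all 16 lane buffers.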
+
+
+%macro ROTATE_ARGS 0
+ %xdefine TMP_ D
+ %xdefine D C
+ %xdefine C B
+ %xdefine B A
+ %xdefine A TMP3
+ %xdefine TMP3 TMP_
+
+ %xdefine TMP2_ H
+ %xdefine H G
+ %xdefine G F
+ %xdefine F E
+ %xdefine E TMP0
+ %xdefine TMP0 TMP2_
+%endmacro
+
+;
+; P() Save in TMP0
+; used TMP1
+%macro P 1
+%define %%A %1
+ vprold TMP0,%%A,9
+ vprold TMP1,%%A,17
+
+ vpternlogd TMP0,TMP1,%%A,0x96
+
+%endmacro
+
+;
+; P1() Save in TMP0
+; used TMP1
+%macro P1 1
+%define %%A %1
+
+ vprold TMP0,%%A,15
+ vprold TMP1,%%A,23
+
+ vpternlogd TMP0,TMP1,%%A,0x96
+%endmacro
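+
+; Scalar reference for the two permutations above, sketched in C for clarity
+; (vpternlogd with immediate 0x96 is a three-way XOR; rol32() is assumed to
+; be a 32-bit left-rotate helper):
+;
+;     uint32_t P (uint32_t x) { return x ^ rol32(x, 9)  ^ rol32(x, 17); }  /* SM3's P0 */
+;     uint32_t P1(uint32_t x) { return x ^ rol32(x, 15) ^ rol32(x, 23); }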
+
+;
+; FF_16() Save in TMP0
+;
+%macro FF_16 3
+%define %%X %1
+%define %%Y %2
+%define %%Z %3
+ ; I < 16 return (X ^ Y ^ Z)
+ vmovups TMP0,%%X
+ vpternlogd TMP0,%%Y,%%Z,0x96
+%endmacro
+
+
+;
+; FF_64() Save in TMP0
+; used TMP1
+%macro FF_64 3
+
+%define %%X %1
+%define %%Y %2
+%define %%Z %3
+	; I >= 16: return (x & y) | (x & z) | (y & z)
+ ; Same as (x & y) | (z & (x | y))
+ vporq TMP0,%%X,%%Y
+ vpandq TMP0,%%Z
+ vpandq TMP1,%%X,%%Y
+ vporq TMP0,TMP1
+%endmacro
+
+
+;
+; GG() Save in TMP0
+; used TMP1
+%macro GG_16 3
+%define %%X %1
+%define %%Y %2
+%define %%Z %3
+ ; I < 16 return (x ^ y ^ z)
+ vmovups TMP0,%%X
+ vpternlogd TMP0,%%Y,%%Z,0x96
+%endmacro
+
+%macro GG_64 3
+
+%define %%X %1
+%define %%Y %2
+%define %%Z %3
+
+	; I >= 16: return (x & y) | ((~x) & z)
+ vpandq TMP0,%%X,%%Y
+ vpandnd TMP1,%%X,%%Z
+ vporq TMP0,TMP1
+%endmacro
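+
+; Scalar reference for the boolean functions above, sketched in C; the *_16
+; variants cover rounds j < 16 and the *_64 variants rounds j >= 16:
+;
+;     uint32_t FF(uint32_t x, uint32_t y, uint32_t z, int j) {
+;         return (j < 16) ? (x ^ y ^ z) : ((x & y) | (x & z) | (y & z));
+;     }
+;     uint32_t GG(uint32_t x, uint32_t y, uint32_t z, int j) {
+;         return (j < 16) ? (x ^ y ^ z) : ((x & y) | (~x & z));
+;     }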
+
+;; void sm3_mb_x16_avx512(SM3_MB_ARGS_X16, uint32_t size)
+; arg 1 : pointer to input data
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+local_func_decl(sm3_mb_x16_avx512)
+sm3_mb_x16_avx512:
+ endbranch
+
+ mov rax, rsp
+ sub rsp, STACK_SPACE
+ and rsp, ~63 ; align stack to multiple of 64
+ mov [rsp + _rsp], rax
+
+ lea TBL, [TABLE]
+
+ ;; Initialize digests
+	vmovups	A, [DIGEST + 0*64]	; unaligned loads
+ vmovups B, [DIGEST + 1*64]
+ vmovups C, [DIGEST + 2*64]
+ vmovups D, [DIGEST + 3*64]
+ vmovups E, [DIGEST + 4*64]
+ vmovups F, [DIGEST + 5*64]
+ vmovups G, [DIGEST + 6*64]
+ vmovups H, [DIGEST + 7*64]
+
+ xor IDX, IDX
+
+%assign cur_loop 0
+lloop:
+ ;; start message expand
+ ;; Transpose input data
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+
+	;; load the current 64-byte block of each lane into WB0...WB15
+	;; (zmm12...zmm27); TRANSPOSE16 below rearranges them word by word
+
+ vmovups WB0,[inp0+IDX]
+ vmovups WB1,[inp1+IDX]
+ vmovups WB2,[inp2+IDX]
+ vmovups WB3,[inp3+IDX]
+ vmovups WB4,[inp4+IDX]
+ vmovups WB5,[inp5+IDX]
+ vmovups WB6,[inp6+IDX]
+ vmovups WB7,[inp7+IDX]
+
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+
+ vmovups WB8, [inp0+IDX]
+ vmovups WB9, [inp1+IDX]
+ vmovups WB10,[inp2+IDX]
+ vmovups WB11,[inp3+IDX]
+ vmovups WB12,[inp4+IDX]
+ vmovups WB13,[inp5+IDX]
+ vmovups WB14,[inp6+IDX]
+ vmovups WB15,[inp7+IDX]
+
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ add IDX, 64
+
+	; transpose so each register holds one message word for all 16 lanes
+ TRANSPOSE16 WB0, WB1, WB2, WB3, WB4, WB5, WB6, WB7, WB8, WB9, WB10, WB11, WB12, WB13, WB14, WB15, TMP0, TMP1
+
+	; byte swap the 32-bit message words (input is big-endian)
+ vmovdqa32 TMP0, [SHUF_MASK]
+ vpshufb WB0,TMP0
+ vpshufb WB1,TMP0
+ vpshufb WB2,TMP0
+ vpshufb WB3,TMP0
+ vpshufb WB4,TMP0
+ vpshufb WB5,TMP0
+ vpshufb WB6,TMP0
+ vpshufb WB7,TMP0
+ vpshufb WB8,TMP0
+ vpshufb WB9,TMP0
+ vpshufb WB10,TMP0
+ vpshufb WB11,TMP0
+ vpshufb WB12,TMP0
+ vpshufb WB13,TMP0
+ vpshufb WB14,TMP0
+ vpshufb WB15,TMP0
+
+%assign I 0
+%rep 12
+ %assign J I+4
+
+ ; (A <<< 12)
+ ; store in TMP0
+ vprold TMP0,A,12
+
+ ; SS1 = ((A <<< 12) + E + (T(j) <<< j)) <<< 7
+ ; (T(j) <<< j) store in TBL
+ ; SS1 store in TMP2
+ vmovdqa32 TMP2, [TBL + (I*64)]
+ vpaddd TMP2,E
+
+ vpaddd TMP2,TMP0
+ vprold TMP2,7
+
+ ; SS2 = SS1 ^ (A <<< 12)
+ ; SS2 store in TMP3
+ vpxord TMP3,TMP2,TMP0
+
+ ; TT2 = GG(E,F,G) + H + SS1 + WB(I)
+ GG_16 E,F,G
+ vpaddd TMP2,TMP0
+ vpaddd TMP2,H
+
+ vpaddd TMP2,APPEND(WB,I)
+
+ ; TT1 = FF(A,B,C) + D + SS2 + W(I)
+ ; TT1 store in TMP3
+ FF_16 A,B,C
+ vpaddd TMP3,TMP0
+ vpaddd TMP3,D
+	; W(I) = WB(I) ^ WB(I+4)
+ vpxord TMP0,APPEND(WB,I),APPEND(WB,J)
+ vpaddd TMP3,TMP0
+
+
+ ; D = C
+ ; C = B <<< 9
+ ; B = A
+ ; A = TT1
+ ; H = G
+ ; G = F <<< 19
+ ; F = E
+ ; E = P(TT2)
+ vmovups D,C
+ vprold B,9
+ vmovups C,B
+ vmovups B,A
+ vmovups A,TMP3
+ vmovups H,G
+ vprold F,19
+ vmovups G,F
+ vmovups F,E
+ P TMP2
+ vmovups E,TMP0
+
+ ;vprold B,9
+ ;vprold F,19
+ ;P TMP2
+ ;ROTATE_ARGS
+
+ %assign I (I+1)
+%endrep
+
+
+;tmp = WB[j - 16] ^ WB[j - 9] ^ rol32(WB[j - 3], 15);
+;WB[j] = P1(tmp) ^ (rol32(WB[j - 13], 7)) ^ WB[j - 6];
+
+; rounds 12-15: the message expansion starts producing new WB words here
+%rep 4
+ %assign J I+4
+
+ %assign J_3 J-3
+ %assign J_16 J-16
+ %assign J_9 J-9
+ %assign J_13 J-13
+ %assign J_6 J-6
+
+	; calc WB(I+4)
+ vprold APPEND(WB,J),APPEND(WB,J_3),15
+ vpxord APPEND(WB,J),APPEND(WB,J_16)
+ vpxord APPEND(WB,J),APPEND(WB,J_9)
+
+ P1 APPEND(WB,J)
+
+ vprold APPEND(WB,J),APPEND(WB,J_13),7
+ vpxord APPEND(WB,J),TMP0
+ vpxord APPEND(WB,J),APPEND(WB,J_6)
+
+ ; (A <<< 12)
+ ; store in TMP0
+ vprold TMP0,A,12
+
+ ; SS1 = ((A <<< 12) + E + (T(j) <<< j)) <<< 7
+ ; (T(j) <<< j) store in TBL
+ ; SS1 store in TMP2
+ vmovdqa32 TMP2, [TBL + (I*64)]
+ vpaddd TMP2,E
+
+ vpaddd TMP2,TMP0
+ vprold TMP2,7
+
+ ; SS2 = SS1 ^ (A <<< 12)
+ ; SS2 store in TMP3
+ vpxord TMP3,TMP2,TMP0
+
+ ; TT2 = GG(E,F,G) + H + SS1 + WB(I)
+ GG_16 E,F,G
+ vpaddd TMP2,TMP0
+ vpaddd TMP2,H
+
+ vpaddd TMP2,APPEND(WB,I)
+
+ ; TT1 = FF(A,B,C) + D + SS2 + W(I)
+ ; TT1 store in TMP3
+ FF_16 A,B,C
+ vpaddd TMP3,TMP0
+ vpaddd TMP3,D
+	; W(I) = WB(I) ^ WB(I+4)
+ vpxord TMP0,APPEND(WB,I),APPEND(WB,J)
+ vpaddd TMP3,TMP0
+
+ ; D = C
+ ; C = B <<< 9
+ ; B = A
+ ; A = TT1
+ ; H = G
+ ; G = F <<< 19
+ ; F = E
+ ; E = P(TT2)
+ vmovups D,C
+ vprold B,9
+ vmovups C,B
+ vmovups B,A
+ vmovups A,TMP3
+ vmovups H,G
+ vprold F,19
+ vmovups G,F
+ vmovups F,E
+ P TMP2
+ vmovups E,TMP0
+
+ %assign I (I+1)
+%endrep
+
+%rep 48
+ %assign J (((I+4) % 20) + 20)
+
+ %assign J_3 ((J-3) % 20)
+ %assign J_16 ((J-16) % 20)
+ %assign J_9 ((J-9) % 20)
+ %assign J_13 ((J-13) % 20)
+ %assign J_6 ((J-6) % 20)
+
+ %assign I_20 (I % 20)
+ %assign J (((I+4) % 20))
+
+ vprold APPEND(WB,J),APPEND(WB,J_3),15
+ vpxord APPEND(WB,J),APPEND(WB,J_16)
+ vpxord APPEND(WB,J),APPEND(WB,J_9)
+
+ P1 APPEND(WB,J)
+
+ vprold APPEND(WB,J),APPEND(WB,J_13),7
+ vpxord APPEND(WB,J),TMP0
+ vpxord APPEND(WB,J),APPEND(WB,J_6)
+
+ ; (A <<< 12)
+ ; store in TMP0
+ vprold TMP0,A,12
+
+ ; SS1 = ((A <<< 12) + E + (T(j) <<< j)) <<< 7
+ ; (T(j) <<< j) store in TBL
+ ; SS1 store in TMP2
+ vmovdqa32 TMP2, [TBL + (I*64)]
+ vpaddd TMP2,E
+
+ vpaddd TMP2,TMP0
+ vprold TMP2,7
+
+ ; SS2 = SS1 ^ (A <<< 12)
+ ; SS2 store in TMP3
+ vpxord TMP3,TMP2,TMP0
+
+ ; TT2 = GG(E,F,G) + H + SS1 + WB(I)
+ GG_64 E,F,G
+ vpaddd TMP2,TMP0
+ vpaddd TMP2,H
+
+ vpaddd TMP2,APPEND(WB,I_20)
+
+ ; TT1 = FF(A,B,C) + D + SS2 + W(I)
+ ; TT1 store in TMP3
+ FF_64 A,B,C
+ vpaddd TMP3,TMP0
+ vpaddd TMP3,D
+	; W(I) = WB(I) ^ WB(I+4)
+ vpxord TMP0,APPEND(WB,I_20),APPEND(WB,J)
+ vpaddd TMP3,TMP0
+
+ ; D = C
+ ; C = B <<< 9
+ ; B = A
+ ; A = TT1
+ ; H = G
+ ; G = F <<< 19
+ ; F = E
+ ; E = P(TT2)
+ vmovups D,C
+ vprold B,9
+ vmovups C,B
+ vmovups B,A
+ vmovups A,TMP3
+ vmovups H,G
+ vprold F,19
+ vmovups G,F
+ vmovups F,E
+ P TMP2
+ vmovups E,TMP0
+
+ %assign I (I+1)
+%endrep
+ ; Xor old digest
+ vpxord A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpxord B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpxord C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpxord D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpxord E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpxord F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpxord G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpxord H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ %assign cur_loop cur_loop+1
+ sub SIZE, 1
+ je last_loop
+
+ jmp lloop
+
+
+last_loop:
+
+%assign I 0
+%rep 8
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+ ; Write out digest
+ vmovups [DIGEST + 0*64], A
+ vmovups [DIGEST + 1*64], B
+ vmovups [DIGEST + 2*64], C
+ vmovups [DIGEST + 3*64], D
+ vmovups [DIGEST + 4*64], E
+ vmovups [DIGEST + 5*64], F
+ vmovups [DIGEST + 6*64], G
+ vmovups [DIGEST + 7*64], H
+
+
+ mov rsp, [rsp + _rsp]
+ ret
+
+
+section .data
+align 64
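+; TABLE holds the per-round constants T(j) <<< j, each value broadcast to all
+; 16 lanes (T(j) = 0x79cc4519 for j < 16 and 0x7a879d8a for j >= 16).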
+TABLE:
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+
+
+
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+SHUF_MASK: dq 0x0405060700010203,0x0c0d0e0f08090a0b
+ dq 0x0405060700010203,0x0c0d0e0f08090a0b
+ dq 0x0405060700010203,0x0c0d0e0f08090a0b
+ dq 0x0405060700010203,0x0c0d0e0f08090a0b
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sm3_mb_x16_avx512
+no_sm3_mb_x16_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm
new file mode 100644
index 000000000..0c2c9cdee
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_mb_x8_avx2.asm
@@ -0,0 +1,711 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sm3_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+[bits 64]
+default rel
+section .text
+
+;; code to compute 8-lane (oct) SM3 using AVX2
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rsi, rdi, r9-r15; eax; ymm0-15
+;; Windows clobbers: rax rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers ymm0-15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+ %define reg3 rcx
+ %define reg4 rdx
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+ %define reg3 rsi
+ %define reg4 rdi
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+%define SIZE INP_SIZE ; rsi
+
+%define IDX rax
+%define TBL reg3
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 reg4
+
+%define APPEND(a,b) a %+ b
+
+%define WB0 ymm0
+%define WB1 ymm1
+%define WB2 ymm2
+%define WB3 ymm3
+%define WB4 ymm4
+%define WB5 ymm5
+%define WB6 ymm6
+%define WB7 ymm7
+%define WB8 ymm8
+%define WB9 ymm9
+%define WB10 ymm10
+%define WB11 ymm11
+%define WB12 ymm12
+%define WB13 ymm13
+%define WB14 ymm14
+%define WB15 ymm15
+
+%define WBTMP0 ymm8
+%define WBTMP1 ymm9
+
+%define WBTMP2 ymm0
+%define WBTMP3 ymm1
+
+%define A ymm0
+%define B ymm1
+%define C ymm2
+%define D ymm3
+%define E ymm4
+%define F ymm5
+%define G ymm6
+%define H ymm7
+
+%define TMP0 ymm8
+%define TMP1 ymm9
+%define TMP2 ymm10
+
+; W'(j) = WB(j) xor WB(j+4)
+; Keep WB(j)..WB(j+4) in registers (Wj0-Wj4) to reduce memory reads
+%define Wj0 ymm11
+%define Wj1 ymm12
+%define Wj2 ymm13
+%define Wj3 ymm14
+%define Wj4 ymm15
+
+
+%define SZ8 8*SM3_DIGEST_WORD_SIZE ; Size of one vector register
+%define PTR_SZ 8
+%define SM3_DIGEST_WORD_SIZE 4
+%define MAX_SM3_LANES 8
+%define NUM_SM3_DIGEST_WORDS 8
+%define SM3_DIGEST_ROW_SIZE (MAX_SM3_LANES * SM3_DIGEST_WORD_SIZE)
+
+; Define stack usage
+
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESZ mod 32 must be 32-8 = 24
+struc stack_frame
+ .data resb 16*SZ8
+ .digest resb 8*SZ8
+ .wbtmp resb 69*SZ8
+ .rsp resb 8
+endstruc
+%define FRAMESZ stack_frame_size
+%define _DIGEST stack_frame.digest
+%define _WBTMP stack_frame.wbtmp
+%define _RSP_SAVE stack_frame.rsp
+
+%define YTMP0 rsp + _WBTMP + 0*SZ8
+%define YTMP1 rsp + _WBTMP + 1*SZ8
+%define YTMP2 rsp + _WBTMP + 2*SZ8
+%define YTMP3 rsp + _WBTMP + 3*SZ8
+%define YTMP4 rsp + _WBTMP + 4*SZ8
+
+%define YTMPI rsp + _WBTMP + I*SZ8
+%define YTMPI_1 rsp + _WBTMP + (I - 1)*SZ8
+%define YTMPI_2 rsp + _WBTMP + (I - 2)*SZ8
+%define YTMPI_4 rsp + _WBTMP + (I - 4)*SZ8
+%define YTMPI5 rsp + _WBTMP + (I + 5)*SZ8
+
+
+%define VMOVPS vmovups
+
+;;;;;;;;
+; same as sha256
+;;;;;;;;
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ ; process bottom half (r4..r7) {e...h}
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
+ vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
+ vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
+ vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
+ vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
+ vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
+ vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
+ vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
+%endmacro
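
The TRANSPOSE8 macro above rearranges eight ymm registers, each holding eight 32-bit words from one message lane, into eight registers that each hold the same word index across all lanes. A scalar sketch of the equivalent operation, illustrative only and not part of isa-l:

#include <stdint.h>

/* Treat the eight registers as rows of an 8x8 matrix of 32-bit words and
 * transpose in place: word c of lane r moves to position [c][r], so each
 * "register" (row) ends up holding one word index across all eight lanes. */
static void transpose8x8(uint32_t m[8][8])
{
	for (int r = 0; r < 8; r++)
		for (int c = r + 1; c < 8; c++) {
			uint32_t t = m[r][c];
			m[r][c] = m[c][r];
			m[c][r] = t;
		}
}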
+
+%macro ROTATE_W 0
+
+ %xdefine TMP_ Wj0
+ %xdefine Wj0 Wj1
+ %xdefine Wj1 Wj2
+ %xdefine Wj2 Wj3
+ %xdefine Wj3 Wj4
+
+ %xdefine Wj4 TMP_
+
+%endmacro
+
+; ROTATE A,B,C,D
+%macro ROTATE_ARGS_AD 0
+
+ %xdefine TMP_ D
+ %xdefine D C
+ %xdefine C B
+ %xdefine B A
+ %xdefine A TMP2
+ %xdefine TMP2 TMP_
+
+%endmacro
+
+%macro ROTATE_ARGS_EH 0
+
+ %xdefine TMP_ H
+ %xdefine H G
+ %xdefine G F
+ %xdefine F E
+ %xdefine E TMP0
+ %xdefine TMP0 TMP_
+
+%endmacro
+
+%macro ROLD 3
+
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, %%imm
+ vpsrld %%reg, %%reg, (32-(%%imm))
+ vpor %%reg, %%reg, %%tmp
+
+%endmacro
+
+%macro ROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, %%imm
+ vpsrld %%reg, %%src, (32-(%%imm))
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+;; void sm3_mb_x8_avx2(SM3_ARGS *args, uint64_t num_blocks);
+;; arg 1 : STATE : pointer to args structure (lane data pointers and digests)
+;; arg 2 : INP_SIZE : size of input in blocks
+mk_global sm3_mb_x8_avx2,function,internal
+align 16
+sm3_mb_x8_avx2:
+ endbranch
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the YMM registers
+
+ ; save rsp, allocate 32-byte aligned for local variables
+ mov IDX, rsp
+ sub rsp, FRAMESZ
+ and rsp, ~31
+ mov [rsp + _RSP_SAVE], IDX
+
+ lea TBL,[TABLE]
+
+ ;; load the address of each of the 8 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _args_data_ptr + 0*PTR_SZ]
+ mov inp1,[STATE + _args_data_ptr + 1*PTR_SZ]
+ mov inp2,[STATE + _args_data_ptr + 2*PTR_SZ]
+ mov inp3,[STATE + _args_data_ptr + 3*PTR_SZ]
+ mov inp4,[STATE + _args_data_ptr + 4*PTR_SZ]
+ mov inp5,[STATE + _args_data_ptr + 5*PTR_SZ]
+ mov inp6,[STATE + _args_data_ptr + 6*PTR_SZ]
+ mov inp7,[STATE + _args_data_ptr + 7*PTR_SZ]
+
+ xor IDX, IDX
+
+%assign cur_loop 0
+lloop:
+
+	;
+	; Pre-calculate the WB 0..68 and W 0..64
+	; This is better than calculating WB/W in the round method
+	;
+	; ps : SHA256(AVX2) calculates WB/W in the round method
+	;
+	; Pre-calculation memory I/O:
+	; 	read : 68 + 3 * 52 (read WB)
+	;	write : 52 (write WB17..68)
+	; Round-method calculation memory I/O:
+	;	read : 48 * 6 (read 6 WB values each round)
+	;	write : 52 + 64 (same as above)
+	;
+ VMOVPS WB0,[inp0+IDX]
+ VMOVPS WB1,[inp1+IDX]
+ VMOVPS WB2,[inp2+IDX]
+ VMOVPS WB3,[inp3+IDX]
+ VMOVPS WB4,[inp4+IDX]
+ VMOVPS WB5,[inp5+IDX]
+ VMOVPS WB6,[inp6+IDX]
+ VMOVPS WB7,[inp7+IDX]
+
+ TRANSPOSE8 WB0, WB1, WB2, WB3, WB4, WB5, WB6, WB7, WBTMP0, WBTMP1
+ vmovdqa WBTMP0, [SHUF_MASK]
+ vpshufb WB0,WBTMP0
+ vpshufb WB1,WBTMP0
+ vpshufb WB2,WBTMP0
+ vpshufb WB3,WBTMP0
+ vpshufb WB4,WBTMP0
+ vpshufb WB5,WBTMP0
+ vpshufb WB6,WBTMP0
+ vpshufb WB7,WBTMP0
+
+ vmovdqa [YTMP0], WB0
+ vmovdqa [YTMP1], WB1
+
+ VMOVPS WB8,[inp0+IDX + 32]
+ VMOVPS WB9,[inp1+IDX + 32]
+ VMOVPS WB10,[inp2+IDX + 32]
+ VMOVPS WB11,[inp3+IDX + 32]
+ VMOVPS WB12,[inp4+IDX + 32]
+ VMOVPS WB13,[inp5+IDX + 32]
+ VMOVPS WB14,[inp6+IDX + 32]
+ VMOVPS WB15,[inp7+IDX + 32]
+
+ TRANSPOSE8 WB8, WB9, WB10, WB11, WB12, WB13, WB14, WB15, WBTMP2, WBTMP3
+ vmovdqa WBTMP2, [SHUF_MASK]
+ vpshufb WB8,WBTMP2
+ vpshufb WB9,WBTMP2
+ vpshufb WB10,WBTMP2
+ vpshufb WB11,WBTMP2
+ vpshufb WB12,WBTMP2
+ vpshufb WB13,WBTMP2
+ vpshufb WB14,WBTMP2
+ vpshufb WB15,WBTMP2
+
+; WB0 WB1 already saved
+%assign I 2
+%rep 14
+ vmovdqa [YTMPI], APPEND(WB,I)
+%assign I (I+1)
+%endrep
+
+ vmovdqa WB0 , [YTMP0]
+ vmovdqa WB1 , [YTMP1]
+
+; Calculate WB 16...67
+%rep 52
+ %assign J (I % 16)
+ %assign J_1 ((I-1) % 16) ;tmp to use
+ %assign J_2 ((I-2) % 16) ;tmp to use
+ %assign J_3 ((I-3) % 16)
+ %assign J_4 ((I-4) % 16) ;tmp to use
+ %assign J_9 ((I-9) % 16)
+ %assign J_13 ((I-13) % 16)
+ %assign J_6 ((I-6) % 16)
+
+ ROLD_nd APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J_3)
+ vpxor APPEND(WB,J),APPEND(WB,J_2)
+ vpxor APPEND(WB,J),APPEND(WB,J_9)
+
+ ROLD_nd APPEND(WB,J_2),15,APPEND(WB,J_1),APPEND(WB,J)
+ ROLD_nd APPEND(WB,J_1),23,APPEND(WB,J_4),APPEND(WB,J)
+ vpxor APPEND(WB,J),APPEND(WB,J_2)
+ vpxor APPEND(WB,J),APPEND(WB,J_1)
+
+ ROLD_nd APPEND(WB,J_2),7,APPEND(WB,J_1),APPEND(WB,J_13)
+ vpxor APPEND(WB,J),APPEND(WB,J_2)
+ vpxor APPEND(WB,J),APPEND(WB,J_6)
+
+ vmovdqa [YTMPI], APPEND(WB,J)
+
+ vmovdqa APPEND(WB,J_1), [YTMPI_1]
+ vmovdqa APPEND(WB,J_2), [YTMPI_2]
+ vmovdqa APPEND(WB,J_4), [YTMPI_4]
+
+ %assign I (I+1)
+%endrep
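
For reference, the %rep 52 block above is an 8-lane vectorization of the standard SM3 message expansion. A scalar C sketch follows; it is illustrative only, with hypothetical helper names rotl32() and sm3_p1(), and uses the 0-based indices WB16..WB67 noted in the comment "Calculate WB 16...67" above.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int r)
{
	return (x << r) | (x >> (32 - r));
}

static inline uint32_t sm3_p1(uint32_t x)	/* P1 permutation */
{
	return x ^ rotl32(x, 15) ^ rotl32(x, 23);
}

/* wb[0..15] hold the byte-swapped message block; this fills wb[16..67],
 * mirroring the ROLD_nd/vpxor sequence in the block above. */
static void sm3_expand(uint32_t wb[68])
{
	for (int j = 16; j < 68; j++)
		wb[j] = sm3_p1(wb[j - 16] ^ wb[j - 9] ^ rotl32(wb[j - 3], 15))
			^ rotl32(wb[j - 13], 7) ^ wb[j - 6];
}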
+
+ add IDX, 4*4*4
+
+	; A-H must be loaded for every block,
+	; because their registers were reused while pre-calculating WB
+ vmovdqu A,[STATE + 0*SM3_DIGEST_ROW_SIZE]
+ vmovdqu B,[STATE + 1*SM3_DIGEST_ROW_SIZE]
+ vmovdqu C,[STATE + 2*SM3_DIGEST_ROW_SIZE]
+ vmovdqu D,[STATE + 3*SM3_DIGEST_ROW_SIZE]
+ vmovdqu E,[STATE + 4*SM3_DIGEST_ROW_SIZE]
+ vmovdqu F,[STATE + 5*SM3_DIGEST_ROW_SIZE]
+ vmovdqu G,[STATE + 6*SM3_DIGEST_ROW_SIZE]
+ vmovdqu H,[STATE + 7*SM3_DIGEST_ROW_SIZE]
+
+ vmovdqa Wj0, [YTMP0]
+ vmovdqa Wj1, [YTMP1]
+ vmovdqa Wj2, [YTMP2]
+ vmovdqa Wj3, [YTMP3]
+ vmovdqa Wj4, [YTMP4]
+
+
+%assign I 0
+%rep 16
+
+ ; SS1 - TMP1
+ ROLD_nd TMP0,12,TMP1,A
+ vmovdqa TMP1, [TBL + (I*32)]
+ vpaddd TMP1,E
+ vpaddd TMP1,TMP0
+ ROLD TMP1,7,TMP2
+
+ ; SS2 - TMP2
+ vpxor TMP2,TMP1,TMP0
+
+ ; TT1
+ vpxor TMP0,A,B
+ vpxor TMP0,C
+ vpaddd TMP2,TMP0
+ vpaddd TMP2,D
+ vpxor TMP0,Wj0,Wj4
+ vpaddd TMP2,TMP0
+
+ ROLD B,9,TMP0
+
+	; Rotate a,b,c,d first;
+	; after P0(TT2), Wj0 will be released
+ ROTATE_ARGS_AD
+
+ ; P0(TT2)
+ vpxor TMP0,E,F
+ vpxor TMP0,G
+ vpaddd TMP0,H
+ vpaddd TMP0,TMP1
+ vpaddd TMP0,Wj0
+
+ ROLD_nd TMP1,9,TMP2,TMP0
+ ROLD_nd Wj0,17,TMP2,TMP0
+
+ vpxor TMP0,TMP1
+ vpxor TMP0,Wj0
+
+ ROLD F,19,TMP2
+
+ ROTATE_ARGS_EH
+
+ ROTATE_W
+
+ vmovdqa Wj4, [YTMPI5]
+ %assign I (I+1)
+%endrep
+
+%rep 48
+ ; SS1 - TMP1
+ ROLD_nd TMP0,12,TMP1,A
+ vmovdqa TMP1, [TBL + (I*32)]
+ vpaddd TMP1,E
+ vpaddd TMP1,TMP0
+ ROLD TMP1,7,TMP2
+
+ ; SS2 - TMP2
+ vpxor TMP2,TMP1,TMP0
+
+	; Add SS2 + D first, so D is released early
+	; FF16/GG16 differ from FF64/GG64,
+	; so the register holding D must be released before TT1 is calculated
+ vpaddd TMP2,D
+
+ ; TT1
+ vpor TMP0,A,B
+ vpand TMP0,C
+ vpand D,A,B
+ vpor TMP0,D
+
+ vpaddd TMP2,TMP0
+ vpxor TMP0,Wj0,Wj4
+ vpaddd TMP2,TMP0
+
+ ROLD B,9,TMP0
+
+ ROTATE_ARGS_AD
+
+ ; P0(TT2)
+ vpaddd TMP1,H
+ vpaddd TMP1,Wj0
+
+ vpand TMP0,E,F
+ vpandn Wj0,E,G
+ vpor TMP0,Wj0
+
+ vpaddd TMP0,TMP1
+
+ ROLD_nd TMP1,9,TMP2,TMP0
+ ROLD_nd Wj0,17,TMP2,TMP0
+
+ vpxor TMP0,TMP1
+ vpxor TMP0,Wj0
+
+ ROLD F,19,TMP2
+
+ ROTATE_ARGS_EH
+
+ ROTATE_W
+ vmovdqa Wj4, [YTMPI5]
+ %assign I (I+1)
+%endrep
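
The two round loops above (16 rounds with the xor forms of FF/GG, then 48 rounds with the boolean forms) correspond to the scalar sketch of one SM3 compression round below. It is illustrative only: it reuses rotl32() from the expansion sketch earlier, takes the round constant t_j from TABLE, and uses the textbook majority form of FF where the assembly uses the equivalent (A|B)&C | (A&B).

/* v[0..7] = a,b,c,d,e,f,g,h; wj = W[j]; wj4 = W[j+4], so wj ^ wj4 is W'[j]. */
static void sm3_round(uint32_t v[8], uint32_t wj, uint32_t wj4,
		      uint32_t t_j, int j)
{
	uint32_t a = v[0], b = v[1], c = v[2], d = v[3];
	uint32_t e = v[4], f = v[5], g = v[6], h = v[7];

	uint32_t ss1 = rotl32(rotl32(a, 12) + e + t_j, 7);
	uint32_t ss2 = ss1 ^ rotl32(a, 12);
	uint32_t ff  = (j < 16) ? (a ^ b ^ c) : ((a & b) | (a & c) | (b & c));
	uint32_t gg  = (j < 16) ? (e ^ f ^ g) : ((e & f) | (~e & g));
	uint32_t tt1 = ff + d + ss2 + (wj ^ wj4);
	uint32_t tt2 = gg + h + ss1 + wj;

	v[3] = c;					/* D = C        */
	v[2] = rotl32(b, 9);				/* C = B <<< 9  */
	v[1] = a;					/* B = A        */
	v[0] = tt1;					/* A = TT1      */
	v[7] = g;					/* H = G        */
	v[6] = rotl32(f, 19);				/* G = F <<< 19 */
	v[5] = e;					/* F = E        */
	v[4] = tt2 ^ rotl32(tt2, 9) ^ rotl32(tt2, 17);	/* E = P0(TT2)  */
}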
+
+ vpxor A, A, [STATE + 0*SM3_DIGEST_ROW_SIZE]
+ vpxor B, B, [STATE + 1*SM3_DIGEST_ROW_SIZE]
+ vpxor C, C, [STATE + 2*SM3_DIGEST_ROW_SIZE]
+ vpxor D, D, [STATE + 3*SM3_DIGEST_ROW_SIZE]
+ vpxor E, E, [STATE + 4*SM3_DIGEST_ROW_SIZE]
+ vpxor F, F, [STATE + 5*SM3_DIGEST_ROW_SIZE]
+ vpxor G, G, [STATE + 6*SM3_DIGEST_ROW_SIZE]
+ vpxor H, H, [STATE + 7*SM3_DIGEST_ROW_SIZE]
+
+ ; Write back to memory (state object) the transposed digest
+ vmovdqu [STATE + 0*SM3_DIGEST_ROW_SIZE],A
+ vmovdqu [STATE + 1*SM3_DIGEST_ROW_SIZE],B
+ vmovdqu [STATE + 2*SM3_DIGEST_ROW_SIZE],C
+ vmovdqu [STATE + 3*SM3_DIGEST_ROW_SIZE],D
+ vmovdqu [STATE + 4*SM3_DIGEST_ROW_SIZE],E
+ vmovdqu [STATE + 5*SM3_DIGEST_ROW_SIZE],F
+ vmovdqu [STATE + 6*SM3_DIGEST_ROW_SIZE],G
+ vmovdqu [STATE + 7*SM3_DIGEST_ROW_SIZE],H
+
+ sub SIZE, 1
+ je last_loop
+ jmp lloop
+
+last_loop:
+
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _args_data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [STATE + _args_data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [STATE + _args_data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [STATE + _args_data_ptr + 3*8], inp3
+ add inp4, IDX
+ mov [STATE + _args_data_ptr + 4*8], inp4
+ add inp5, IDX
+ mov [STATE + _args_data_ptr + 5*8], inp5
+ add inp6, IDX
+ mov [STATE + _args_data_ptr + 6*8], inp6
+ add inp7, IDX
+ mov [STATE + _args_data_ptr + 7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ mov rsp, [rsp + _RSP_SAVE]
+ ret
+
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+align 64
+global TABLE
+TABLE:
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0x79cc451979cc4519,0x79cc451979cc4519
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xf3988a32f3988a32,0xf3988a32f3988a32
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xe7311465e7311465,0xe7311465e7311465
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0xce6228cbce6228cb,0xce6228cbce6228cb
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x9cc451979cc45197,0x9cc451979cc45197
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x3988a32f3988a32f,0x3988a32f3988a32f
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0x7311465e7311465e,0x7311465e7311465e
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xe6228cbce6228cbc,0xe6228cbce6228cbc
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0xcc451979cc451979,0xcc451979cc451979
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x988a32f3988a32f3,0x988a32f3988a32f3
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x311465e7311465e7,0x311465e7311465e7
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0x6228cbce6228cbce,0x6228cbce6228cbce
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0xc451979cc451979c,0xc451979cc451979c
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x88a32f3988a32f39,0x88a32f3988a32f39
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x11465e7311465e73,0x11465e7311465e73
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x228cbce6228cbce6,0x228cbce6228cbce6
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0x7a879d8a7a879d8a,0x7a879d8a7a879d8a
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xf50f3b14f50f3b14,0xf50f3b14f50f3b14
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xea1e7629ea1e7629,0xea1e7629ea1e7629
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xd43cec53d43cec53,0xd43cec53d43cec53
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0xa879d8a7a879d8a7,0xa879d8a7a879d8a7
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0x50f3b14f50f3b14f,0x50f3b14f50f3b14f
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0xa1e7629ea1e7629e,0xa1e7629ea1e7629e
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x43cec53d43cec53d,0x43cec53d43cec53d
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x879d8a7a879d8a7a,0x879d8a7a879d8a7a
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x0f3b14f50f3b14f5,0x0f3b14f50f3b14f5
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x1e7629ea1e7629ea,0x1e7629ea1e7629ea
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x3cec53d43cec53d4,0x3cec53d43cec53d4
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0x79d8a7a879d8a7a8,0x79d8a7a879d8a7a8
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xf3b14f50f3b14f50,0xf3b14f50f3b14f50
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xe7629ea1e7629ea1,0xe7629ea1e7629ea1
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0xcec53d43cec53d43,0xcec53d43cec53d43
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x9d8a7a879d8a7a87,0x9d8a7a879d8a7a87
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x3b14f50f3b14f50f,0x3b14f50f3b14f50f
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0x7629ea1e7629ea1e,0x7629ea1e7629ea1e
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xec53d43cec53d43c,0xec53d43cec53d43c
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xd8a7a879d8a7a879,0xd8a7a879d8a7a879
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0xb14f50f3b14f50f3,0xb14f50f3b14f50f3
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0x629ea1e7629ea1e7,0x629ea1e7629ea1e7
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0xc53d43cec53d43ce,0xc53d43cec53d43ce
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x8a7a879d8a7a879d,0x8a7a879d8a7a879d
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x14f50f3b14f50f3b,0x14f50f3b14f50f3b
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x29ea1e7629ea1e76,0x29ea1e7629ea1e76
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0x53d43cec53d43cec,0x53d43cec53d43cec
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0xa7a879d8a7a879d8,0xa7a879d8a7a879d8
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x4f50f3b14f50f3b1,0x4f50f3b14f50f3b1
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x9ea1e7629ea1e762,0x9ea1e7629ea1e762
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+ dq 0x3d43cec53d43cec5,0x3d43cec53d43cec5
+
+SHUF_MASK: dq 0x0405060700010203,0x0c0d0e0f08090a0b
+ dq 0x0405060700010203,0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm
new file mode 100644
index 000000000..482876539
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_multibinary.asm
@@ -0,0 +1,81 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2020 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+extern sm3_ctx_mgr_init_base
+extern sm3_ctx_mgr_submit_base
+extern sm3_ctx_mgr_flush_base
+
+extern sm3_ctx_mgr_init_avx2
+extern sm3_ctx_mgr_submit_avx2
+extern sm3_ctx_mgr_flush_avx2
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern sm3_ctx_mgr_init_avx512
+ extern sm3_ctx_mgr_submit_avx512
+ extern sm3_ctx_mgr_flush_avx512
+%endif
+
+;;; *_mbinit are the initial values for *_dispatched, which is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+; Initialise symbols
+mbin_interface sm3_ctx_mgr_init
+mbin_interface sm3_ctx_mgr_submit
+mbin_interface sm3_ctx_mgr_flush
+
+;; sse/avx versions are not implemented yet
+%ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 sm3_ctx_mgr_init, sm3_ctx_mgr_init_base, \
+ sm3_ctx_mgr_init_base, sm3_ctx_mgr_init_base, sm3_ctx_mgr_init_avx2, \
+ sm3_ctx_mgr_init_avx512
+ mbin_dispatch_init6 sm3_ctx_mgr_submit, sm3_ctx_mgr_submit_base, \
+ sm3_ctx_mgr_submit_base, sm3_ctx_mgr_submit_base, sm3_ctx_mgr_submit_avx2, \
+ sm3_ctx_mgr_submit_avx512
+ mbin_dispatch_init6 sm3_ctx_mgr_flush, sm3_ctx_mgr_flush_base, \
+ sm3_ctx_mgr_flush_base, sm3_ctx_mgr_flush_base, sm3_ctx_mgr_flush_avx2, \
+ sm3_ctx_mgr_flush_avx512
+%else
+ mbin_dispatch_init sm3_ctx_mgr_init, sm3_ctx_mgr_init_base, \
+ sm3_ctx_mgr_init_base,sm3_ctx_mgr_init_avx2
+ mbin_dispatch_init sm3_ctx_mgr_submit, sm3_ctx_mgr_submit_base, \
+ sm3_ctx_mgr_submit_base,sm3_ctx_mgr_submit_avx2
+ mbin_dispatch_init sm3_ctx_mgr_flush, sm3_ctx_mgr_flush_base, \
+ sm3_ctx_mgr_flush_base,sm3_ctx_mgr_flush_avx2
+%endif
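
As the comments above note, the mbin_interface/mbin_dispatch_init macros bind each public symbol lazily on the first call. A rough, self-contained C analogue of that idea is sketched below; the stub functions, the my_ prefix and __builtin_cpu_supports() (a GCC/Clang builtin) are stand-ins for the real implementations and CPUID checks, not the actual macro expansion.

#include <stdio.h>

/* stand-ins for the real base/AVX2 implementations, illustration only */
static void sm3_init_base_stub(void *mgr) { (void)mgr; puts("base"); }
static void sm3_init_avx2_stub(void *mgr) { (void)mgr; puts("avx2"); }

static void sm3_init_dispatch(void *mgr);
static void (*sm3_init_ptr)(void *) = sm3_init_dispatch;

/* first call: pick an implementation from CPU features, then forward to it */
static void sm3_init_dispatch(void *mgr)
{
	sm3_init_ptr = __builtin_cpu_supports("avx2")
			? sm3_init_avx2_stub : sm3_init_base_stub;
	sm3_init_ptr(mgr);
}

/* public entry point: later calls go straight to the resolved version */
void my_sm3_ctx_mgr_init(void *mgr)
{
	sm3_init_ptr(mgr);
}

int main(void)
{
	my_sm3_ctx_mgr_init(NULL);	/* resolves and runs on the first call */
	my_sm3_ctx_mgr_init(NULL);	/* goes directly to the chosen version */
	return 0;
}

Only the first call pays the feature-detection cost; every later call goes through the already-resolved pointer.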
+
+;;; func core, ver, snum
+slversion sm3_ctx_mgr_init, 00, 00, 2300
+slversion sm3_ctx_mgr_submit, 00, 00, 2301
+slversion sm3_ctx_mgr_flush, 00, 00, 2302
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c
new file mode 100644
index 000000000..be56350b3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_ref_test.c
@@ -0,0 +1,207 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#define ISAL_UNIT_TEST
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+
+typedef uint32_t digest_sm3[SM3_DIGEST_NWORDS];
+
+#define MSGS 2
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
+
+static uint8_t msg1[] = "abc";
+static uint8_t msg2[] = "abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd";
+
+/* little endian */
+static digest_sm3 exp_result_digest1 = { 0x66c7f0f4, 0x62eeedd9, 0xd1f2d46b, 0xdc10e4e2,
+ 0x4167c487, 0x5cf2f7a2, 0x297da02b, 0x8f4ba8e0
+};
+
+/* little endian */
+static digest_sm3 exp_result_digest2 = { 0xdebe9ff9, 0x2275b8a1, 0x38604889, 0xc18e5a4d,
+ 0x6fdb70e5, 0x387e5765, 0x293dcba3, 0x9c0c5732
+};
+
+static uint8_t *msgs[MSGS] = { msg1, msg2 };
+
+static uint32_t *exp_result_digest[MSGS] = {
+ exp_result_digest1, exp_result_digest2
+};
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = exp_result_digest[t];
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j],
+ byteswap32(good[j]));
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
+ while (1) {
+ ctx = sm3_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = exp_result_digest[t];
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j],
+ byteswap32(good[j]));
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+				       " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = sm3_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = exp_result_digest[k];
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j],
+ byteswap32(good[j]));
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+ }
+ }
+ while (1) {
+ ctx = sm3_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = exp_result_digest[k];
+ checked++;
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (byteswap32(good[j]) != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j],
+ byteswap32(good[j]));
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+				       " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sm3 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c
new file mode 100644
index 000000000..4c0c54436
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sm3_mb/sm3_test_helper.c
@@ -0,0 +1,45 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <openssl/evp.h>
+
+void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest)
+{
+ EVP_MD_CTX *md_ctx;
+ const EVP_MD *md;
+ unsigned int md_len;
+
+ md = EVP_sm3();
+ md_ctx = EVP_MD_CTX_new();
+ EVP_DigestInit_ex(md_ctx, md, NULL);
+ EVP_DigestUpdate(md_ctx, buf, length);
+ EVP_DigestFinal_ex(md_ctx, digest, &md_len);
+ EVP_MD_CTX_free(md_ctx);
+}
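
For context, a minimal standalone caller of sm3_ossl() could look like the sketch below. It is not part of the test suite; it assumes the 32-byte SM3 digest size and reuses the "abc" vector whose expected digest appears in sm3_ref_test.c. Link it against sm3_test_helper.c and -lcrypto, as the extended-tests Makefile does.

#include <stdio.h>
#include <string.h>

void sm3_ossl(const unsigned char *buf, size_t length, unsigned char *digest);

int main(void)
{
	unsigned char digest[32];	/* SM3 produces a 256-bit digest */
	const char *msg = "abc";

	sm3_ossl((const unsigned char *)msg, strlen(msg), digest);

	for (int i = 0; i < 32; i++)
		printf("%02x", digest[i]);
	printf("\n");	/* expected: 66c7f0f4..., as in exp_result_digest1 */
	return 0;
}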
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile b/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile
new file mode 100644
index 000000000..964baee11
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile
@@ -0,0 +1,19 @@
+CFLAGS += -I ../../include
+libs += ../../bin/isa-l_crypto.a
+tests = $(patsubst %test.c, %test, $(wildcard *_test.c))
+
+tests: $(tests)
+$(tests): $(libs)
+%test: %test.c
+ $(CC) $< $(libs) $(CFLAGS) $(LDLIBS) -o $@
+$(libs):
+ $(MAKE) -C ../../ -f Makefile.unx
+test: $(addsuffix .run,$(tests))
+ @echo ALL PASS
+$(addsuffix .run,$(tests)): %.run: %
+ $(SIM) ./$<
+ @echo Completed run: $<
+clean:
+ $(RM) *.o $(tests)
+
+$(tests): LDLIBS += -lcrypto
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile.nmake b/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile.nmake
new file mode 100644
index 000000000..daaf04e79
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/Makefile.nmake
@@ -0,0 +1,58 @@
+########################################################################
+# Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+tests = md5_mb_over_4GB_test.exe sha1_mb_over_4GB_test.exe \
+sha256_mb_over_4GB_test.exe sha512_mb_over_4GB_test.exe
+
+INCLUDES = -I../../include
+LINKFLAGS = /nologo
+INCLUDES = $(INCLUDES) -Ic:\OpenSSL-Win64\include
+CFLAGS = -O2 -D NDEBUG /nologo -D_USE_MATH_DEFINES -Qstd=c99 $(INCLUDES) /c
+LINKFLAGS = $(LINKFLAGS) /libpath:c:\OpenSSL-Win64\lib
+LIBS = ../../isa-l_crypto.lib
+DLL = isa-l_crypto.dll
+
+tests: lib $(tests)
+$(tests): $(@B).obj
+ link /out:$@ $(LINKFLAGS) libeay32.lib $(LIBS) $*.obj
+%.obj: %.c
+ $(CC) $(CFLAGS) -Fo$@ $?
+
+lib:
+ cd ../../ && nmake -f Makefile.nmake
+ cd ../../ && copy $(DLL) "tests\extended"
+
+test: $(tests)
+ !$?
+ echo ALL PASS
+
+clean:
+ -if exist *.obj del *.obj
+ -if exist *.exe del *.exe
+ -if exist *.dll del *.dll
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/md5_mb_over_4GB_test.c b/src/crypto/isa-l/isa-l_crypto/tests/extended/md5_mb_over_4GB_test.c
new file mode 100644
index 000000000..5eb7be75f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/md5_mb_over_4GB_test.c
@@ -0,0 +1,155 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "md5_mb.h"
+#include "endian_helper.h"
+#include <openssl/md5.h>
+#define TEST_LEN (1024*1024ull) //1M
+#define TEST_BUFS MD5_MIN_LANES
+#define ROTATION_TIMES 10000 //total length processing = TEST_LEN * ROTATION_TIMES
+#define UPDATE_SIZE (13*MD5_BLOCK_SIZE)
+#define LEN_TOTAL (TEST_LEN * ROTATION_TIMES)
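
Worked out, LEN_TOTAL = 1 MiB * 10000 = 10,485,760,000 bytes per lane, roughly 9.8 GiB, so every context's running length crosses the 4 GiB (32-bit) boundary this test is named for; the same arithmetic applies to the sha1, sha256, sha512 and sm3 variants below.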
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ref_upd[4 * MD5_DIGEST_NWORDS];
+
+struct user_data {
+ int idx;
+ uint64_t processed;
+};
+
+int main(void)
+{
+ MD5_CTX o_ctx; //openSSL
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, k, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ struct user_data udata[TEST_BUFS];
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+ md5_ctx_mgr_init(mgr);
+
+ printf("md5_large_test\n");
+
+ // Init ctx contents
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)&udata[i];
+ }
+
+ //Openssl MD5 update test
+ MD5_Init(&o_ctx);
+ for (k = 0; k < ROTATION_TIMES; k++) {
+ MD5_Update(&o_ctx, bufs[k % TEST_BUFS], TEST_LEN);
+ }
+ MD5_Final(digest_ref_upd, &o_ctx);
+
+ // Initialize pool
+ for (i = 0; i < TEST_BUFS; i++) {
+ struct user_data *u = (struct user_data *)ctxpool[i].user_data;
+ u->idx = i;
+ u->processed = 0;
+ }
+
+ printf("Starting updates\n");
+ int highest_pool_idx = 0;
+ ctx = &ctxpool[highest_pool_idx++];
+ while (ctx) {
+ int len = UPDATE_SIZE;
+ int update_type = HASH_UPDATE;
+ struct user_data *u = (struct user_data *)ctx->user_data;
+ int idx = u->idx;
+
+ if (u->processed == 0)
+ update_type = HASH_FIRST;
+
+ else if (hash_ctx_complete(ctx)) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = md5_ctx_mgr_flush(mgr);
+ continue;
+ } else if (u->processed >= (LEN_TOTAL - UPDATE_SIZE)) {
+ len = (LEN_TOTAL - u->processed);
+ update_type = HASH_LAST;
+ }
+ u->processed += len;
+ ctx = md5_ctx_mgr_submit(mgr, ctx, bufs[idx], len, update_type);
+
+ if (NULL == ctx) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = md5_ctx_mgr_flush(mgr);
+ }
+ }
+
+ printf("multibuffer md5 digest: \n");
+ for (i = 0; i < TEST_BUFS; i++) {
+ printf("Total processing size of buf[%d] is %ld \n", i,
+ ctxpool[i].total_length);
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ printf("digest%d : %08X\n", j, ctxpool[i].job.result_digest[j]);
+ }
+ }
+ printf("\n");
+
+ printf("openssl md5 update digest: \n");
+ for (i = 0; i < MD5_DIGEST_NWORDS; i++)
+ printf("%08X - ", to_le32(((uint32_t *) digest_ref_upd)[i]));
+ printf("\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref_upd)[j])) {
+ fail++;
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed md5 hash large file check %d\n", fail);
+ else
+ printf(" md5_hash_large_test: Pass\n");
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/sha1_mb_over_4GB_test.c b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha1_mb_over_4GB_test.c
new file mode 100644
index 000000000..af94a8098
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha1_mb_over_4GB_test.c
@@ -0,0 +1,156 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha1_mb.h"
+#include "endian_helper.h"
+#include <openssl/sha.h>
+#define TEST_LEN (1024*1024ull) //1M
+#define TEST_BUFS SHA1_MIN_LANES
+#define ROTATION_TIMES 10000 //total length processing = TEST_LEN * ROTATION_TIMES
+#define UPDATE_SIZE (13*SHA1_BLOCK_SIZE)
+#define LEN_TOTAL (TEST_LEN * ROTATION_TIMES)
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ref_upd[4 * SHA1_DIGEST_NWORDS];
+
+struct user_data {
+ int idx;
+ uint64_t processed;
+};
+
+int main(void)
+{
+ SHA_CTX o_ctx; //openSSL
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, k, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ struct user_data udata[TEST_BUFS];
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ printf("sha1_large_test\n");
+
+ // Init ctx contents
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)&udata[i];
+ }
+
+ //Openssl SHA1 update test
+ SHA1_Init(&o_ctx);
+ for (k = 0; k < ROTATION_TIMES; k++) {
+ SHA1_Update(&o_ctx, bufs[k % TEST_BUFS], TEST_LEN);
+ }
+ SHA1_Final(digest_ref_upd, &o_ctx);
+
+ // Initialize pool
+ for (i = 0; i < TEST_BUFS; i++) {
+ struct user_data *u = (struct user_data *)ctxpool[i].user_data;
+ u->idx = i;
+ u->processed = 0;
+ }
+
+ printf("Starting updates\n");
+ int highest_pool_idx = 0;
+ ctx = &ctxpool[highest_pool_idx++];
+ while (ctx) {
+ int len = UPDATE_SIZE;
+ int update_type = HASH_UPDATE;
+ struct user_data *u = (struct user_data *)ctx->user_data;
+ int idx = u->idx;
+
+ if (u->processed == 0)
+ update_type = HASH_FIRST;
+
+ else if (hash_ctx_complete(ctx)) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sha1_ctx_mgr_flush(mgr);
+ continue;
+ } else if (u->processed >= (LEN_TOTAL - UPDATE_SIZE)) {
+ len = (LEN_TOTAL - u->processed);
+ update_type = HASH_LAST;
+ }
+ u->processed += len;
+ ctx = sha1_ctx_mgr_submit(mgr, ctx, bufs[idx], len, update_type);
+
+ if (NULL == ctx) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sha1_ctx_mgr_flush(mgr);
+ }
+ }
+
+ printf("multibuffer SHA1 digest: \n");
+ for (i = 0; i < TEST_BUFS; i++) {
+ printf("Total processing size of buf[%d] is %ld \n", i,
+ ctxpool[i].total_length);
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ printf("digest%d : %08X\n", j, ctxpool[i].job.result_digest[j]);
+ }
+ }
+ printf("\n");
+
+ printf("openssl SHA1 update digest: \n");
+ for (i = 0; i < SHA1_DIGEST_NWORDS; i++)
+ printf("%08X - ", to_be32(((uint32_t *) digest_ref_upd)[i]));
+ printf("\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ref_upd)[j])) {
+ fail++;
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed SHA1 hash large file check %d\n", fail);
+ else
+ printf(" SHA1_hash_large_test: Pass\n");
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/sha256_mb_over_4GB_test.c b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha256_mb_over_4GB_test.c
new file mode 100644
index 000000000..35bbdcbae
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha256_mb_over_4GB_test.c
@@ -0,0 +1,156 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+#include "endian_helper.h"
+#include <openssl/sha.h>
+#define TEST_LEN (1024*1024ull) //1M
+#define TEST_BUFS SHA256_MIN_LANES
+#define ROTATION_TIMES 10000 //total length processing = TEST_LEN * ROTATION_TIMES
+#define UPDATE_SIZE (13*SHA256_BLOCK_SIZE)
+#define LEN_TOTAL (TEST_LEN * ROTATION_TIMES)
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ref_upd[4 * SHA256_DIGEST_NWORDS];
+
+struct user_data {
+ int idx;
+ uint64_t processed;
+};
+
+int main(void)
+{
+ SHA256_CTX o_ctx; //openSSL
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, k, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ struct user_data udata[TEST_BUFS];
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ printf("sha256_large_test\n");
+
+ // Init ctx contents
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)&udata[i];
+ }
+
+ //Openssl SHA256 update test
+ SHA256_Init(&o_ctx);
+ for (k = 0; k < ROTATION_TIMES; k++) {
+ SHA256_Update(&o_ctx, bufs[k % TEST_BUFS], TEST_LEN);
+ }
+ SHA256_Final(digest_ref_upd, &o_ctx);
+
+ // Initialize pool
+ for (i = 0; i < TEST_BUFS; i++) {
+ struct user_data *u = (struct user_data *)ctxpool[i].user_data;
+ u->idx = i;
+ u->processed = 0;
+ }
+
+ printf("Starting updates\n");
+ int highest_pool_idx = 0;
+ ctx = &ctxpool[highest_pool_idx++];
+ while (ctx) {
+ int len = UPDATE_SIZE;
+ int update_type = HASH_UPDATE;
+ struct user_data *u = (struct user_data *)ctx->user_data;
+ int idx = u->idx;
+
+ if (u->processed == 0)
+ update_type = HASH_FIRST;
+
+ else if (hash_ctx_complete(ctx)) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sha256_ctx_mgr_flush(mgr);
+ continue;
+ } else if (u->processed >= (LEN_TOTAL - UPDATE_SIZE)) {
+ len = (LEN_TOTAL - u->processed);
+ update_type = HASH_LAST;
+ }
+ u->processed += len;
+ ctx = sha256_ctx_mgr_submit(mgr, ctx, bufs[idx], len, update_type);
+
+ if (NULL == ctx) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sha256_ctx_mgr_flush(mgr);
+ }
+ }
+
+ printf("multibuffer SHA256 digest: \n");
+ for (i = 0; i < TEST_BUFS; i++) {
+ printf("Total processing size of buf[%d] is %ld \n", i,
+ ctxpool[i].total_length);
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ printf("digest%d : %08X\n", j, ctxpool[i].job.result_digest[j]);
+ }
+ }
+ printf("\n");
+
+ printf("openssl SHA256 update digest: \n");
+ for (i = 0; i < SHA256_DIGEST_NWORDS; i++)
+ printf("%08X - ", to_be32(((uint32_t *) digest_ref_upd)[i]));
+ printf("\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be32(((uint32_t *) digest_ref_upd)[j])) {
+ fail++;
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed SHA256 hash large file check %d\n", fail);
+ else
+ printf(" SHA256_hash_large_test: Pass\n");
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/sha512_mb_over_4GB_test.c b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha512_mb_over_4GB_test.c
new file mode 100644
index 000000000..9c2aeaead
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/sha512_mb_over_4GB_test.c
@@ -0,0 +1,156 @@
+/**********************************************************************
+ Copyright(c) 2011-2017 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha512_mb.h"
+#include "endian_helper.h"
+#include <openssl/sha.h>
+#define TEST_LEN (1024*1024ull) //1M
+#define TEST_BUFS SHA512_MIN_LANES
+#define ROTATION_TIMES 10000 //total length processing = TEST_LEN * ROTATION_TIMES
+#define UPDATE_SIZE (13*SHA512_BLOCK_SIZE)
+#define LEN_TOTAL (TEST_LEN * ROTATION_TIMES)
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ref_upd[8 * SHA512_DIGEST_NWORDS];
+
+struct user_data {
+ int idx;
+ uint64_t processed;
+};
+
+int main(void)
+{
+ SHA512_CTX o_ctx; //openSSL
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, k, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ struct user_data udata[TEST_BUFS];
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sha512_ctx_mgr_init(mgr);
+
+ printf("sha512_large_test\n");
+
+ // Init ctx contents
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)&udata[i];
+ }
+
+ //Openssl SHA512 update test
+ SHA512_Init(&o_ctx);
+ for (k = 0; k < ROTATION_TIMES; k++) {
+ SHA512_Update(&o_ctx, bufs[k % TEST_BUFS], TEST_LEN);
+ }
+ SHA512_Final(digest_ref_upd, &o_ctx);
+
+ // Initialize pool
+ for (i = 0; i < TEST_BUFS; i++) {
+ struct user_data *u = (struct user_data *)ctxpool[i].user_data;
+ u->idx = i;
+ u->processed = 0;
+ }
+
+ printf("Starting updates\n");
+ int highest_pool_idx = 0;
+ ctx = &ctxpool[highest_pool_idx++];
+ while (ctx) {
+ int len = UPDATE_SIZE;
+ int update_type = HASH_UPDATE;
+ struct user_data *u = (struct user_data *)ctx->user_data;
+ int idx = u->idx;
+
+ if (u->processed == 0)
+ update_type = HASH_FIRST;
+
+ else if (hash_ctx_complete(ctx)) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sha512_ctx_mgr_flush(mgr);
+ continue;
+ } else if (u->processed >= (LEN_TOTAL - UPDATE_SIZE)) {
+ len = (LEN_TOTAL - u->processed);
+ update_type = HASH_LAST;
+ }
+ u->processed += len;
+ ctx = sha512_ctx_mgr_submit(mgr, ctx, bufs[idx], len, update_type);
+
+ if (NULL == ctx) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sha512_ctx_mgr_flush(mgr);
+ }
+ }
+
+ printf("multibuffer sha512 digest: \n");
+ for (i = 0; i < TEST_BUFS; i++) {
+ printf("Total processing size of buf[%d] is %ld \n", i,
+ ctxpool[i].total_length);
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ printf("digest%d : %016lX\n", j, ctxpool[i].job.result_digest[j]);
+ }
+ }
+ printf("\n");
+
+ printf("openssl sha512 update digest: \n");
+ for (i = 0; i < SHA512_DIGEST_NWORDS; i++)
+ printf("%016lX - ", to_be64(((uint64_t *) digest_ref_upd)[i]));
+ printf("\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_be64(((uint64_t *) digest_ref_upd)[j])) {
+ fail++;
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed sha512 hash large file check %d\n", fail);
+ else
+ printf(" sha512_hash_large_test: Pass\n");
+ return fail;
+}
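
For reference, the submit/flush pattern that the loop above drives across many contexts reduces, for a single context, to the sketch below. It is an illustration only, not part of the patch; the three-block buffer and the submit_and_wait() helper are invented for the example, while the sha512_mb calls are the same ones used in the test.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "sha512_mb.h"

    /* Flush until the manager hands the submitted context back. */
    static SHA512_HASH_CTX *submit_and_wait(SHA512_HASH_CTX_MGR *mgr, SHA512_HASH_CTX *ctx,
                                            unsigned char *buf, uint32_t len, int flags)
    {
            SHA512_HASH_CTX *c = sha512_ctx_mgr_submit(mgr, ctx, buf, len, flags);
            while (c == NULL)
                    c = sha512_ctx_mgr_flush(mgr);
            return c;
    }

    int main(void)
    {
            SHA512_HASH_CTX_MGR *mgr = NULL;
            SHA512_HASH_CTX ctx;
            unsigned char buf[3 * SHA512_BLOCK_SIZE];
            uint32_t i;

            memset(buf, 0xab, sizeof(buf));
            if (posix_memalign((void **)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR)) != 0)
                    return 1;
            sha512_ctx_mgr_init(mgr);
            hash_ctx_init(&ctx);

            // HASH_FIRST starts the job, HASH_UPDATE extends it, HASH_LAST pads and finalizes
            submit_and_wait(mgr, &ctx, buf, SHA512_BLOCK_SIZE, HASH_FIRST);
            submit_and_wait(mgr, &ctx, buf + SHA512_BLOCK_SIZE, SHA512_BLOCK_SIZE, HASH_UPDATE);
            submit_and_wait(mgr, &ctx, buf + 2 * SHA512_BLOCK_SIZE, SHA512_BLOCK_SIZE, HASH_LAST);

            if (!hash_ctx_complete(&ctx))
                    return 1;
            for (i = 0; i < SHA512_DIGEST_NWORDS; i++)
                    printf("digest%d : %016lX\n", i, ctx.job.result_digest[i]);
            free(mgr);
            return 0;
    }
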
diff --git a/src/crypto/isa-l/isa-l_crypto/tests/extended/sm3_mb_over_4GB_test.c b/src/crypto/isa-l/isa-l_crypto/tests/extended/sm3_mb_over_4GB_test.c
new file mode 100644
index 000000000..ea98e29b5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tests/extended/sm3_mb_over_4GB_test.c
@@ -0,0 +1,162 @@
+/**********************************************************************
+ Copyright(c) 2011-2019 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sm3_mb.h"
+#include "endian_helper.h"
+#include <openssl/evp.h>
+
+#define TEST_LEN (1024*1024ull) //1M
+#define TEST_BUFS SM3_MAX_LANES
+#define ROTATION_TIMES 10000 //total length processing = TEST_LEN * ROTATION_TIMES
+#define UPDATE_SIZE (13*SM3_BLOCK_SIZE)
+#define LEN_TOTAL (TEST_LEN * ROTATION_TIMES)
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ref_upd[4 * SM3_DIGEST_NWORDS];
+
+struct user_data {
+ int idx;
+ uint64_t processed;
+};
+
+int main(void)
+{
+ SM3_HASH_CTX_MGR *mgr = NULL;
+ SM3_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, k, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ struct user_data udata[TEST_BUFS];
+ EVP_MD_CTX *md_ctx;
+ const EVP_MD *md;
+ unsigned int md_len;
+ int ret;
+
+ ret = posix_memalign((void *)&mgr, 16, sizeof(SM3_HASH_CTX_MGR));
+ if ((ret != 0) || (mgr == NULL)) {
+ printf("posix_memalign failed test aborted\n");
+ return 1;
+ }
+
+ sm3_ctx_mgr_init(mgr);
+
+ printf("sm3_large_test\n");
+
+ // Init ctx contents
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t)TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)&udata[i];
+ }
+
+ //Openssl SM3 update test
+ md = EVP_sm3();
+ md_ctx = EVP_MD_CTX_new();
+ EVP_DigestInit_ex(md_ctx, md, NULL);
+ for (k = 0; k < ROTATION_TIMES; k++) {
+ EVP_DigestUpdate(md_ctx, bufs[k % TEST_BUFS], TEST_LEN);
+ }
+ EVP_DigestFinal_ex(md_ctx, digest_ref_upd, &md_len);
+ EVP_MD_CTX_free(md_ctx);
+
+ // Initialize pool
+ for (i = 0; i < TEST_BUFS; i++) {
+ struct user_data *u = (struct user_data *)ctxpool[i].user_data;
+ u->idx = i;
+ u->processed = 0;
+ }
+
+ printf("Starting updates\n");
+ int highest_pool_idx = 0;
+ ctx = &ctxpool[highest_pool_idx++];
+ while (ctx) {
+ int len = UPDATE_SIZE;
+ int update_type = HASH_UPDATE;
+ struct user_data *u = (struct user_data *)ctx->user_data;
+ int idx = u->idx;
+
+ if (u->processed == 0)
+ update_type = HASH_FIRST;
+
+ else if (hash_ctx_complete(ctx)) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sm3_ctx_mgr_flush(mgr);
+ continue;
+ } else if (u->processed >= (LEN_TOTAL - UPDATE_SIZE)) {
+ len = (LEN_TOTAL - u->processed);
+ update_type = HASH_LAST;
+ }
+ u->processed += len;
+ ctx = sm3_ctx_mgr_submit(mgr, ctx, bufs[idx], len, update_type);
+
+ if (NULL == ctx) {
+ if (highest_pool_idx < TEST_BUFS)
+ ctx = &ctxpool[highest_pool_idx++];
+ else
+ ctx = sm3_ctx_mgr_flush(mgr);
+ }
+ }
+
+ printf("multibuffer SM3 digest: \n");
+ for (i = 0; i < TEST_BUFS; i++) {
+ printf("Total processing size of buf[%d] is %ld \n", i,
+ ctxpool[i].total_length);
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ printf("digest%d : %08X\n", j, ctxpool[i].job.result_digest[j]);
+ }
+ }
+ printf("\n");
+
+ printf("openssl SM3 update digest: \n");
+ for (i = 0; i < SM3_DIGEST_NWORDS; i++)
+ printf("%08X - ", to_le32(((uint32_t *) digest_ref_upd)[i]));
+ printf("\n");
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SM3_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ to_le32(((uint32_t *) digest_ref_upd)[j])) {
+ fail++;
+ }
+ }
+ }
+
+ if (fail)
+ printf("Test failed SM3_hash_large check %d\n", fail);
+ else
+ printf(" SM3_hash_large_test: Pass\n");
+ return fail;
+}
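
The test above streams data through HASH_FIRST/HASH_UPDATE/HASH_LAST; for a buffer that is available in one piece, the same sm3_mb manager can hash it in a single submit. A minimal sketch (illustration only; it assumes the HASH_ENTIRE flag from the common hash-ctx API, which this extended test does not use, and the 1 KiB message is made up):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "sm3_mb.h"

    int main(void)
    {
            SM3_HASH_CTX_MGR *mgr = NULL;
            SM3_HASH_CTX ctx, *c;
            unsigned char msg[1024];
            uint32_t i;

            memset(msg, 0x5a, sizeof(msg));
            if (posix_memalign((void **)&mgr, 16, sizeof(SM3_HASH_CTX_MGR)) != 0)
                    return 1;
            sm3_ctx_mgr_init(mgr);
            hash_ctx_init(&ctx);

            // One-shot submit: HASH_ENTIRE covers first and last block in one call
            c = sm3_ctx_mgr_submit(mgr, &ctx, msg, sizeof(msg), HASH_ENTIRE);
            while (c == NULL)
                    c = sm3_ctx_mgr_flush(mgr);

            if (!hash_ctx_complete(&ctx))
                    return 1;
            for (i = 0; i < SM3_DIGEST_NWORDS; i++)
                    printf("digest%d : %08X\n", i, ctx.job.result_digest[i]);
            free(mgr);
            return 0;
    }
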
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/check_format.sh b/src/crypto/isa-l/isa-l_crypto/tools/check_format.sh
new file mode 100755
index 000000000..8c67a931c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/check_format.sh
@@ -0,0 +1,87 @@
+#!/usr/bin/env bash
+
+set -e
+rc=0
+verbose=0
+indent_args='-linux -l95 -cp1 -lps -il6 -ncs'
+function iver { printf "%03d%03d%03d%03d" $(echo "$@" | sed 's/GNU indent//' | tr '.' ' '); }
+
+while [ -n "$*" ]; do
+ case "$1" in
+ -v )
+ verbose=1
+ shift
+ ;;
+ -h )
+ echo check_format.sh [-h -v]
+ exit 0
+ ;;
+ esac
+done
+
+echo "Checking format of files in the git index at $PWD"
+if ! git rev-parse --is-inside-work-tree >& /dev/null; then
+ echo "Not in a git repo: Fail"
+ exit 1
+fi
+
+if hash indent && [ $(iver $(indent --version)) -ge $(iver 2.2.12) ]; then
+ echo "Checking C files for coding style..."
+ for f in `git ls-files '*.c'`; do
+ [ "$verbose" -gt 0 ] && echo "checking style on $f"
+ if ! indent $indent_args -st $f | diff -q $f - >& /dev/null; then
+ echo " File found with formatting issues: $f"
+ [ "$verbose" -gt 0 ] 2> /dev/null && indent $indent_args -st $f | diff -u $f -
+ rc=1
+ fi
+ done
+ [ "$rc" -gt 0 ] && echo " Run ./tools/iindent on files"
+else
+ echo "You do not have a recent indent installed so your code style is not being checked!"
+fi
+
+if hash grep; then
+ echo "Checking for dos and whitespace violations..."
+ for f in $(git ls-files); do
+ [ "$verbose" -gt 0 ] && echo "checking whitespace on $f"
+ if grep -q '[[:space:]]$' $f ; then
+ echo " File found with trailing whitespace: $f"
+ rc=1
+ fi
+ if grep -q $'\r' $f ; then
+ echo " File found with dos formatting: $f"
+ rc=1
+ fi
+ done
+fi
+
+echo "Checking source files for permissions..."
+while read -r perm _res0 _res1 f; do
+ [ -z "$f" ] && continue
+ [ "$verbose" -gt 0 ] && echo "checking permissions on $f"
+ if [ "$perm" -ne 100644 ]; then
+ echo " File found with permissions issue ($perm): $f"
+ rc=1
+ fi
+done <<< $(git ls-files -s -- ':(exclude)*.sh' ':(exclude)*iindent')
+
+echo "Checking script files for permissions..."
+while read -r perm _res0 _res1 f; do
+ [ -z "$f" ] && continue
+ [ "$verbose" -gt 0 ] && echo "checking permissions on $f"
+ if [ "$perm" -ne 100755 ]; then
+ echo " Script found with permissions issue ($perm): $f"
+ rc=1
+ fi
+done <<< $(git ls-files -s '*.sh')
+
+
+echo "Checking for signoff in commit message..."
+if ! git log -n 1 --format=%B | grep -q "^Signed-off-by:" ; then
+ echo " Commit not signed off. Please read src/CONTRIBUTING.md"
+ rc=1
+fi
+
+[ "$rc" -gt 0 ] && echo Format Fail || echo Format Pass
+
+exit $rc
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/gen_nmake.mk b/src/crypto/isa-l/isa-l_crypto/tools/gen_nmake.mk
new file mode 100644
index 000000000..f2c8b46ed
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/gen_nmake.mk
@@ -0,0 +1,123 @@
+# Regenerate nmake file from makefiles or check its consistency
+
+test_nmake_file: tst.nmake
+ @diff -u Makefile.nmake tst.nmake || (echo Potential nmake consistency issue; $(RM) tst.nmake; false;)
+ @echo No nmake consistency issues
+ @$(RM) tst.nmake
+
+FORCE:
+Makefile.nmake tst.nmake: FORCE
+ @echo Regenerating $@
+ @echo '########################################################################' > $@
+ @cat LICENSE | sed -e 's/^/#/ ' >> $@
+ @echo '########################################################################' >> $@
+ @echo '' >> $@
+ @echo '# This file can be auto-regenerated with $$make -f Makefile.unx Makefile.nmake' >> $@
+ @echo '' >> $@
+ @echo -n 'objs =' >> $@
+ @$(foreach o, $(subst /,\\,$(objs:.o=.obj)), printf " %s\n\t%s" \\ $(o) >> $@; )
+ @echo '' >> $@
+ @echo '' >> $@
+ @echo 'INCLUDES = $(INCLUDE)' >> $@
+ @echo '# Modern asm feature level, consider upgrading nasm/yasm before decreasing feature_level' >> $@
+ @echo 'FEAT_FLAGS = -DHAVE_AS_KNOWS_AVX512 -DAS_FEATURE_LEVEL=10 -DHAVE_AS_KNOWS_SHANI' >> $@
+ @echo 'CFLAGS_REL = -O2 -DNDEBUG /Z7 /MD /Gy' >> $@
+ @echo 'CFLAGS_DBG = -Od -DDEBUG /Z7 /MDd' >> $@
+ @echo 'LINKFLAGS = -nologo -incremental:no -debug' >> $@
+ @echo 'CFLAGS = $$(CFLAGS_REL) -nologo -D_USE_MATH_DEFINES $$(FEAT_FLAGS) $$(INCLUDES) $$(D)' >> $@
+ @echo 'AFLAGS = -f win64 $$(FEAT_FLAGS) $$(INCLUDES) $$(D)' >> $@
+ @echo 'CC = cl' >> $@
+ @echo '# or CC = icl -Qstd=c99' >> $@
+ @echo 'AS = nasm' >> $@
+ @echo '' >> $@
+ @echo 'lib: bin static dll' >> $@
+ @echo 'static: bin isa-l_crypto_static.lib' >> $@
+ @echo 'dll: bin isa-l_crypto.dll' >> $@
+ @echo '' >> $@
+ @echo 'bin: ; -mkdir $$@' >> $@
+ @echo '' >> $@
+ @echo 'isa-l_crypto_static.lib: $$(objs)' >> $@
+ @echo ' lib -out:$$@ @<<' >> $@
+ @echo '$$?' >> $@
+ @echo '<<' >> $@
+ @echo '' >> $@
+ @echo 'isa-l_crypto.dll: $$(objs)' >> $@
+ @echo ' link -out:$$@ -dll -def:isa-l_crypto.def $$(LINKFLAGS) @<<' >> $@
+ @echo '$$?' >> $@
+ @echo '<<' >> $@
+ @echo '' >> $@
+ @$(foreach b, $(units), \
+ printf "{%s}.c.obj:\n\t\$$(CC) \$$(CFLAGS) /c -Fo\$$@ \$$?\n{%s}.asm.obj:\n\t\$$(AS) \$$(AFLAGS) -o \$$@ \$$?\n\n" $(b) $(b) >> $@; )
+ @echo '' >> $@
+ifneq (,$(examples))
+ @echo "# Examples" >> $@
+ @echo -n 'ex =' >> $@
+ @$(foreach ex, $(notdir $(examples)), printf " %s\n\t%s.exe" \\ $(ex) >> $@; )
+ @echo '' >> $@
+ @echo '' >> $@
+ @echo 'ex: lib $$(ex)' >> $@
+ @echo '' >> $@
+ @echo '$$(ex): $$(@B).obj' >> $@
+endif
+ @echo '' >> $@
+ @echo '.obj.exe:' >> $@
+ @echo ' link /out:$$@ $$(LINKFLAGS) isa-l_crypto.lib $$?' >> $@
+ @echo '' >> $@
+ @echo '# Check tests' >> $@
+ @echo -n 'checks =' >> $@
+ @$(foreach check, $(notdir $(check_tests)), printf " %s\n\t%s.exe" \\ $(check) >> $@; )
+ @echo '' >> $@
+ @echo '' >> $@
+ @echo 'checks: lib $$(checks)' >> $@
+ @echo '$$(checks): $$(@B).obj' >> $@
+ @echo 'check: $$(checks)' >> $@
+ @echo ' !$$?' >> $@
+ @echo '' >> $@
+ @echo '# Unit tests' >> $@
+ @echo -n 'tests =' >> $@
+ @$(foreach test, $(notdir $(unit_tests)), printf " %s\n\t%s.exe" \\ $(test) >> $@; )
+ @echo '' >> $@
+ @echo '' >> $@
+ @echo 'tests: lib $$(tests)' >> $@
+ @echo '$$(tests): $$(@B).obj' >> $@
+ @echo '' >> $@
+ @echo '# Performance tests' >> $@
+ @echo -n 'perfs =' >> $@
+ @$(foreach perf, $(notdir $(perf_tests)), printf " %s\n\t%s.exe" \\ $(perf) >> $@; )
+ @echo '' >> $@
+ @echo '' >> $@
+ @echo 'perfs: lib $$(perfs)' >> $@
+ @echo '$$(perfs): $$(@B).obj' >> $@
+ @echo '' >> $@
+ @echo -n 'progs =' >> $@
+ @$(foreach prog, $(notdir $(bin_PROGRAMS)), printf " %s\n\t%s.exe" \\ $(prog) >> $@; )
+ @echo '' >> $@
+ @echo '' >> $@
+ @echo 'progs: lib $$(progs)' >> $@
+ @$(foreach p, $(notdir $(bin_PROGRAMS)), \
+ printf "%s.exe: %s\n\tlink /out:\$$@ \$$(LINKFLAGS) isa-l_crypto.lib \$$?\n" $(p) $(subst /,\\,$(programs_$(p)_SOURCES:.c=.obj)) >> $@; )
+ @echo '' >> $@
+ @echo 'clean:' >> $@
+ @echo ' -if exist *.obj del *.obj' >> $@
+ @echo ' -if exist bin\*.obj del bin\*.obj' >> $@
+ @echo ' -if exist isa-l_crypto_static.lib del isa-l_crypto_static.lib' >> $@
+ @echo ' -if exist *.exe del *.exe' >> $@
+ @echo ' -if exist *.pdb del *.pdb' >> $@
+ @echo ' -if exist isa-l_crypto.lib del isa-l_crypto.lib' >> $@
+ @echo ' -if exist isa-l_crypto.dll del isa-l_crypto.dll' >> $@
+ @echo ' -if exist isa-l_crypto.exp del isa-l_crypto.exp' >> $@
+ @echo '' >> $@
+ @echo 'libcrypto.lib:' >> $@
+ @cat $(foreach unit,$(units), $(unit)/Makefile.am) | sed \
+ -e '/: /!d' \
+ -e 's/\([^ :]*\)[ ]*/\1.exe /g' \
+ -e :c -e 's/:\(.*\).exe/:\1/;tc' \
+ -e 's/\.o[ $$]/.obj /g' \
+ -e 's/\.o\.exe[ ]:/.obj:/g' \
+ -e '/CFLAGS_.*+=/d' \
+ -e '/:.*\%.*:/d' \
+ -e 's/ :/:/' \
+ -e 's/LDLIBS *+=//' \
+ -e 's/-lcrypto/libcrypto.lib/' \
+ -e 's/ $$//' \
+ >> $@
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/iindent b/src/crypto/isa-l/isa-l_crypto/tools/iindent
new file mode 100755
index 000000000..48d26360f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/iindent
@@ -0,0 +1,2 @@
+#!/bin/sh
+indent -linux -l95 -cp1 -lps -il6 -ncs "$@"
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/nasm-cet-filter.sh b/src/crypto/isa-l/isa-l_crypto/tools/nasm-cet-filter.sh
new file mode 100755
index 000000000..19e03856c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/nasm-cet-filter.sh
@@ -0,0 +1,56 @@
+#!/bin/sh
+
+# Filter out unnecessary options added by automake
+
+while [ -n "$*" ]; do
+ case "$1" in
+ -o )
+ # Supported options with arg
+ options="$options $1 $2"
+ shift
+ object="$1"
+ shift
+ ;;
+ -f | -D )
+ # Supported options with arg
+ options="$options $1 $2"
+ shift
+ shift
+ ;;
+ -I | -i )
+ options="$options $1 $2/"
+ shift
+ shift
+ ;;
+ --prefix* )
+ # Supported options without arg
+ options="$options $1"
+ shift
+ ;;
+ -I* | -i* )
+ options="$options $1/"
+ shift
+ ;;
+ -D* ) # For defines we need to remove spaces
+ case "$1" in
+ *' '* ) ;;
+ *) options="$options $1" ;;
+ esac
+ shift
+ ;;
+ #-blah )
+ # Unsupported options with args - none known
+ -* )
+ # Unsupported options with no args
+ shift
+ ;;
+ * )
+ args="$args $1"
+ shift
+ ;;
+ esac
+done
+
+nasm $options $args
+$CET_LD -r -z ibt -z shstk -o $object.tmp $object
+mv $object.tmp $object
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh b/src/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh
new file mode 100755
index 000000000..5ec9ba3f3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh
@@ -0,0 +1,47 @@
+#!/bin/sh
+
+# Filter out unnecessary options added by automake
+
+while [ -n "$*" ]; do
+ case "$1" in
+ -f | -o | -D )
+ # Supported options with arg
+ options="$options $1 $2"
+ shift
+ shift
+ ;;
+ -I | -i )
+ options="$options $1 $2/"
+ shift
+ shift
+ ;;
+ --prefix* )
+ # Supported options without arg
+ options="$options $1"
+ shift
+ ;;
+ -I* | -i* )
+ options="$options $1/"
+ shift
+ ;;
+ -D* ) # For defines we need to remove spaces
+ case "$1" in
+ *' '* ) ;;
+ *) options="$options $1" ;;
+ esac
+ shift
+ ;;
+ #-blah )
+ # Unsupported options with args - none known
+ -* )
+ # Unsupported options with no args
+ shift
+ ;;
+ * )
+ args="$args $1"
+ shift
+ ;;
+ esac
+done
+
+nasm $options $args
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/remove_trailing_whitespace.sh b/src/crypto/isa-l/isa-l_crypto/tools/remove_trailing_whitespace.sh
new file mode 100755
index 000000000..bb82b9fa5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/remove_trailing_whitespace.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+sed -i.bak 's/[[:blank:]]*$//' "$@"
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/test_autorun.sh b/src/crypto/isa-l/isa-l_crypto/tools/test_autorun.sh
new file mode 100755
index 000000000..756e1e069
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/test_autorun.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+
+set -e #exit on fail
+
+# Override defaults if exist
+READLINK=readlink
+command -V greadlink >/dev/null 2>&1 && READLINK=greadlink
+
+
+out="$PWD"
+src=$($READLINK -f $(dirname $0))/..
+cd "$src"
+
+# Echo environment info
+if test -d .git; then
+ branch=$(git describe)
+ commitid=$(git rev-parse HEAD)
+ brief=$(git log -1 --format='%s')
+ branch_changes=$(git diff --shortstat)
+fi
+if command -V uname >/dev/null 2>&1; then
+ node=$(uname -n)
+ os_name=$(uname -s)
+ os_all=$(uname -a)
+fi
+
+echo "Test report v1"
+echo "branch: $branch"
+echo "brief: $brief"
+echo "commitid: $commitid"
+echo "node: $node"
+echo "os_name: $os_name"
+echo "os_all: $os_all"
+echo "test_args: $@"
+echo "changes: $branch_changes"
+command -V lscpu > /dev/null 2>&1 && lscpu
+
+# Start tests
+
+[ -z "$1" ] && ./tools/test_checks.sh
+
+while [ -n "$1" ]; do
+ case "$1" in
+ check )
+ ./tools/test_checks.sh
+ shift ;;
+ ext )
+ ./tools/test_extended.sh
+ shift ;;
+ format )
+ shift ;;
+ all )
+ ./tools/test_checks.sh
+ ./tools/test_extended.sh
+ shift ;;
+ * )
+ echo $0 undefined option: $1
+ shift ;;
+ esac
+done
+
+./tools/check_format.sh
+
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/test_checks.sh b/src/crypto/isa-l/isa-l_crypto/tools/test_checks.sh
new file mode 100755
index 000000000..9573554db
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/test_checks.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+
+set -xe #exit on fail
+
+# Defaults
+cpus=1
+S=$RANDOM
+MAKE=make
+READLINK=readlink
+
+# Override defaults if exist
+command -V gmake >/dev/null 2>&1 && MAKE=gmake
+command -V greadlink >/dev/null 2>&1 && READLINK=greadlink
+
+out="$PWD"
+src=$($READLINK -f $(dirname $0))/..
+source $src/tools/test_tools.sh
+cd "$src"
+tmp_install_dir=$out/tmp_install
+
+# Run on mult cpus
+if command -V lscpu >/dev/null 2>&1; then
+ cpus=`lscpu -p | tail -1 | cut -d, -f 2`
+ cpus=$(($cpus + 1))
+elif command -V sysctl; then
+ if sysctl -n hw.ncpu >/dev/null 2>&1; then
+ cpus=$(sysctl -n hw.ncpu)
+ cpus=$(($cpus + 1))
+ fi
+fi
+echo "Using $cpus cpu threads"
+
+# Pick a random test seed
+if [ -z "$S" ]; then
+ S=`tr -cd 0-9 </dev/urandom | head -c 4 | sed -e 's/^0*/1/g'`
+ [ "$S" -gt 0 ] 2> /dev/null || S="123"
+fi
+echo "Running with TEST_SEED=$S"
+
+# Fix Darwin issues
+if uname | grep -q 'Darwin' 2>&1; then
+ export SED=`which sed`
+ opt_config_target='--target=darwin'
+fi
+
+# Tests
+time ./autogen.sh
+time ./configure --prefix=$tmp_install_dir $opt_config_target
+time $MAKE -j $cpus
+test_start "check_tests"
+time $MAKE check -j $cpus D="-D TEST_SEED=$S"
+test_end "check_tests" $?
+test_start "installation_test"
+time $MAKE install
+test_end "installation_test" $?
+
+# Check for gnu executable stack set
+if command -V readelf >/dev/null 2>&1; then
+ if readelf -W -l $tmp_install_dir/lib/libisal_crypto.so | grep 'GNU_STACK' | grep -q 'RWE'; then
+ echo Stack NX check $tmp_install_dir/lib/libisal_crypto.so Fail
+ exit 1
+ else
+ echo Stack NX check $tmp_install_dir/lib/libisal_crypto.so Pass
+ fi
+else
+ echo Stack NX check not supported
+fi
+
+$MAKE clean
+
+
+
+echo $0: Pass
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/test_extended.sh b/src/crypto/isa-l/isa-l_crypto/tools/test_extended.sh
new file mode 100755
index 000000000..b79cbb0c1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/test_extended.sh
@@ -0,0 +1,127 @@
+#!/usr/bin/env bash
+
+# Extended tests: Run a few more options other than make check
+
+set -xe #exit on fail
+
+# Defaults
+cpus=1
+S=$RANDOM
+MAKE=make
+READLINK=readlink
+test_level=check
+build_opt=''
+msg=''
+
+# Override defaults if exist
+command -V gmake >/dev/null 2>&1 && MAKE=gmake
+command -V greadlink >/dev/null 2>&1 && READLINK=greadlink
+[ -n "$CC" ] && build_opt+="CC=$CC "
+[ -n "$AS" ] && build_opt+="AS=$AS "
+
+out="$PWD"
+src=$($READLINK -f $(dirname $0))/..
+source $src/tools/test_tools.sh
+cd "$src"
+
+# Run on mult cpus
+if command -V lscpu >/dev/null 2>&1; then
+ cpus=`lscpu -p | tail -1 | cut -d, -f 2`
+ cpus=$(($cpus + 1))
+elif command -V sysctl; then
+ if sysctl -n hw.ncpu >/dev/null 2>&1; then
+ cpus=$(sysctl -n hw.ncpu)
+ cpus=$(($cpus + 1))
+ fi
+fi
+echo "Using $cpus cpu threads"
+
+if [ -z "$S" ]; then
+ S=`tr -cd 0-9 </dev/urandom | head -c 4 | sed -e 's/^0*/1/g'`
+ [ "$S" -gt 0 ] 2> /dev/null || S="123"
+fi
+msg+="Running with TEST_SEED=$S".$'\n'
+
+# Fix Darwin issues
+if uname | grep -q 'Darwin' 2>&1; then
+ export SED=`which sed`
+fi
+
+# Check for test libs to add
+if command -V ldconfig >/dev/null 2>&1; then
+ if ldconfig -p | grep -q libcrypto.so; then
+ test_level=test
+ msg+=$'With extra tests\n'
+ fi
+ if ldconfig -p | grep -q libefence.so; then
+ build_opt+="LDFLAGS+='-lefence' "
+ msg+=$'With efence\n'
+ fi
+fi
+
+# Std makefile build test
+$MAKE -f Makefile.unx clean
+test_start "extended_build_test"
+time $MAKE -f Makefile.unx -j $cpus $build_opt
+test_end "extended_build_test" $?
+msg+=$'Std makefile build: Pass\n'
+
+# Check for gnu executable stack set
+if command -V readelf >/dev/null 2>&1; then
+ test_start "stack_nx_check"
+ if readelf -W -l bin/libisal_crypto.so | grep 'GNU_STACK' | grep -q 'RWE'; then
+ echo $0: Stack NX check bin/libisal_crypto.so: Fail
+ test_end "stack_nx_check" 1
+ exit 1
+ else
+ test_end "stack_nx_check" 0
+        msg+=$'Stack NX check bin/libisal_crypto.so: Pass\n'
+ fi
+else
+ msg+=$'Stack NX check not supported: Skip\n'
+fi
+
+# Std makefile build perf tests
+test_start "extended_perf_test"
+time $MAKE -f Makefile.unx -j $cpus perfs
+test_end "extended_perf_test" $?
+msg+=$'Std makefile build perf: Pass\n'
+
+# Std makefile run tests
+test_start "extended_makefile_tests"
+time $MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S" $test_level
+test_end "extended_makefile_tests" $?
+msg+=$'Std makefile tests: Pass\n'
+
+# Std makefile build other
+test_start "extended_other_tests"
+time $MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S" other
+test_end "extended_other_tests" $?
+msg+=$'Other tests build: Pass\n'
+
+$MAKE -f Makefile.unx clean
+
+# Std makefile run tests with NT_LDST
+test_start "extended_makefile_tests with NT_LDST"
+$MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S NT_LDST"
+time $MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S NT_LDST" $test_level
+test_end "extended_makefile_tests with NT_LDST" $?
+msg+=$'Std makefile tests with NT_LDST: Pass\n'
+
+$MAKE -f Makefile.unx clean
+
+# noarch makefile run tests
+test_start "extended_makefile_tests"
+time $MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S" \
+ arch=noarch
+time $MAKE -f Makefile.unx -j $cpus $build_opt D="TEST_SEED=$S" \
+ arch=noarch $test_level
+test_end "extended_makefile_tests" $?
+msg+=$'noarch makefile tests: Pass\n'
+
+set +x
+echo
+echo "Summary test $0:"
+echo "Build opt: $build_opt"
+echo "$msg"
+echo "$0: Final: Pass"
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/test_tools.sh b/src/crypto/isa-l/isa-l_crypto/tools/test_tools.sh
new file mode 100755
index 000000000..448b1f92b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/test_tools.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+
+function test_start()
+{
+ echo "entering test: $1"
+}
+
+function test_end()
+{
+ echo "leaving test: $1 status: $2"
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/yasm-cet-filter.sh b/src/crypto/isa-l/isa-l_crypto/tools/yasm-cet-filter.sh
new file mode 100755
index 000000000..d7b3e973d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/yasm-cet-filter.sh
@@ -0,0 +1,47 @@
+#!/bin/sh
+
+# Filter out unnecessary options added by automake
+
+while [ -n "$*" ]; do
+ case "$1" in
+ -o )
+ # Supported options with arg
+ options="$options $1 $2"
+ shift
+ object="$1"
+ shift
+ ;;
+ -f | -I | -i | -D )
+ # Supported options with arg
+ options="$options $1 $2"
+ shift
+ shift
+ ;;
+ -I* | -i* | --prefix* )
+ # Supported options without arg
+ options="$options $1"
+ shift
+ ;;
+ -D* ) # For defines we need to remove spaces
+ case "$1" in
+ *' '* ) ;;
+ *) options="$options $1" ;;
+ esac
+ shift
+ ;;
+ #-blah )
+ # Unsupported options with args - none known
+ -* )
+ # Unsupported options with no args
+ shift
+ ;;
+ * )
+ args="$args $1"
+ shift
+ ;;
+ esac
+done
+
+yasm $options $args
+$CET_LD -r -z ibt -z shstk -o $object.tmp $object
+mv $object.tmp $object
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh b/src/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh
new file mode 100755
index 000000000..c33952a40
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+
+# Filter out unnecessary options added by automake
+
+while [ -n "$*" ]; do
+ case "$1" in
+ -f | -o | -I | -i | -D )
+ # Supported options with arg
+ options="$options $1 $2"
+ shift
+ shift
+ ;;
+ -I* | -i* | --prefix* )
+ # Supported options without arg
+ options="$options $1"
+ shift
+ ;;
+ -D* ) # For defines we need to remove spaces
+ case "$1" in
+ *' '* ) ;;
+ *) options="$options $1" ;;
+ esac
+ shift
+ ;;
+ #-blah )
+ # Unsupported options with args - none known
+ -* )
+ # Unsupported options with no args
+ shift
+ ;;
+ * )
+ args="$args $1"
+ shift
+ ;;
+ esac
+done
+
+yasm $options $args
diff --git a/src/crypto/isa-l/isal_crypto_accel.cc b/src/crypto/isa-l/isal_crypto_accel.cc
new file mode 100644
index 000000000..7dccf64fd
--- /dev/null
+++ b/src/crypto/isa-l/isal_crypto_accel.cc
@@ -0,0 +1,43 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "crypto/isa-l/isal_crypto_accel.h"
+
+#include "crypto/isa-l/isa-l_crypto/include/aes_cbc.h"
+
+bool ISALCryptoAccel::cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+ alignas(16) struct cbc_key_data keys_blk;
+ aes_cbc_precomp(const_cast<unsigned char*>(&key[0]), AES_256_KEYSIZE, &keys_blk);
+ aes_cbc_enc_256(const_cast<unsigned char*>(in),
+ const_cast<unsigned char*>(&iv[0]), keys_blk.enc_keys, out, size);
+ return true;
+}
+bool ISALCryptoAccel::cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+ alignas(16) struct cbc_key_data keys_blk;
+ aes_cbc_precomp(const_cast<unsigned char*>(&key[0]), AES_256_KEYSIZE, &keys_blk);
+ aes_cbc_dec_256(const_cast<unsigned char*>(in), const_cast<unsigned char*>(&iv[0]), keys_blk.dec_keys, out, size);
+ return true;
+}
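
The aes_cbc_precomp()/aes_cbc_enc_256()/aes_cbc_dec_256() calls used above can also be exercised stand-alone. A minimal C round-trip sketch (illustration only; it assumes the isa-l aes_cbc.h header is on the include path, the all-zero key/IV and 64-byte buffer are made up, and 32/16 are simply the AES-256 key size and CBC block size written as literals):

    #include <stdio.h>
    #include <string.h>
    #include <stdalign.h>
    #include "aes_cbc.h"

    int main(void)
    {
            unsigned char key[32] = { 0 };  /* AES-256 key, all-zero for the example */
            unsigned char iv[16] = { 0 };   /* CBC IV, one 16-byte block */
            unsigned char pt[64], ct[64], out[64];
            alignas(16) struct cbc_key_data keys;  /* expanded keys, 16-byte aligned as above */

            memset(pt, 0x11, sizeof(pt));   /* size must be a multiple of the 16-byte block */

            aes_cbc_precomp(key, sizeof(key), &keys);
            aes_cbc_enc_256(pt, iv, keys.enc_keys, ct, sizeof(pt));
            aes_cbc_dec_256(ct, iv, keys.dec_keys, out, sizeof(ct));

            printf("round trip %s\n", memcmp(pt, out, sizeof(pt)) == 0 ? "ok" : "failed");
            return 0;
    }
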
diff --git a/src/crypto/isa-l/isal_crypto_accel.h b/src/crypto/isa-l/isal_crypto_accel.h
new file mode 100644
index 000000000..84331bbdd
--- /dev/null
+++ b/src/crypto/isa-l/isal_crypto_accel.h
@@ -0,0 +1,31 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef ISAL_CRYPTO_ACCEL_H
+#define ISAL_CRYPTO_ACCEL_H
+#include "crypto/crypto_accel.h"
+
+class ISALCryptoAccel : public CryptoAccel {
+ public:
+ ISALCryptoAccel() {}
+ virtual ~ISALCryptoAccel() {}
+
+ bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+ bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+};
+#endif
diff --git a/src/crypto/isa-l/isal_crypto_plugin.cc b/src/crypto/isa-l/isal_crypto_plugin.cc
new file mode 100644
index 000000000..85f0e5f0f
--- /dev/null
+++ b/src/crypto/isa-l/isal_crypto_plugin.cc
@@ -0,0 +1,34 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+
+// -----------------------------------------------------------------------------
+#include "crypto/isa-l/isal_crypto_plugin.h"
+
+#include "ceph_ver.h"
+// -----------------------------------------------------------------------------
+
+const char *__ceph_plugin_version()
+{
+ return CEPH_GIT_NICE_VER;
+}
+
+int __ceph_plugin_init(CephContext *cct,
+ const std::string& type,
+ const std::string& name)
+{
+ auto instance = cct->get_plugin_registry();
+
+ return instance->add(type, name, new ISALCryptoPlugin(cct));
+}
diff --git a/src/crypto/isa-l/isal_crypto_plugin.h b/src/crypto/isa-l/isal_crypto_plugin.h
new file mode 100644
index 000000000..68e782e69
--- /dev/null
+++ b/src/crypto/isa-l/isal_crypto_plugin.h
@@ -0,0 +1,47 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef ISAL_CRYPTO_PLUGIN_H
+#define ISAL_CRYPTO_PLUGIN_H
+// -----------------------------------------------------------------------------
+#include "crypto/crypto_plugin.h"
+#include "crypto/isa-l/isal_crypto_accel.h"
+#include "arch/intel.h"
+#include "arch/probe.h"
+// -----------------------------------------------------------------------------
+
+
+class ISALCryptoPlugin : public CryptoPlugin {
+
+public:
+
+ explicit ISALCryptoPlugin(CephContext* cct) : CryptoPlugin(cct)
+ {}
+ ~ISALCryptoPlugin()
+ {}
+ virtual int factory(CryptoAccelRef *cs,
+ std::ostream *ss)
+ {
+ if (cryptoaccel == nullptr)
+ {
+ ceph_arch_probe();
+ if (ceph_arch_intel_aesni && ceph_arch_intel_sse41) {
+ cryptoaccel = CryptoAccelRef(new ISALCryptoAccel);
+ }
+ }
+ *cs = cryptoaccel;
+ return 0;
+ }
+};
+#endif
diff --git a/src/crypto/openssl/CMakeLists.txt b/src/crypto/openssl/CMakeLists.txt
new file mode 100644
index 000000000..6ede1567f
--- /dev/null
+++ b/src/crypto/openssl/CMakeLists.txt
@@ -0,0 +1,14 @@
+## openssl
+
+set(openssl_crypto_plugin_srcs
+ openssl_crypto_accel.cc
+ openssl_crypto_plugin.cc)
+
+add_library(ceph_crypto_openssl SHARED ${openssl_crypto_plugin_srcs})
+target_link_libraries(ceph_crypto_openssl
+ PRIVATE OpenSSL::Crypto
+ $<$<PLATFORM_ID:Windows>:ceph-common>)
+target_include_directories(ceph_crypto_openssl PRIVATE ${OPENSSL_INCLUDE_DIR})
+add_dependencies(crypto_plugins ceph_crypto_openssl)
+set_target_properties(ceph_crypto_openssl PROPERTIES INSTALL_RPATH "")
+install(TARGETS ceph_crypto_openssl DESTINATION ${crypto_plugin_dir})
diff --git a/src/crypto/openssl/openssl_crypto_accel.cc b/src/crypto/openssl/openssl_crypto_accel.cc
new file mode 100644
index 000000000..e6ea0fa72
--- /dev/null
+++ b/src/crypto/openssl/openssl_crypto_accel.cc
@@ -0,0 +1,104 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "crypto/openssl/openssl_crypto_accel.h"
+#include <openssl/evp.h>
+#include <openssl/engine.h>
+#include "common/debug.h"
+
+// -----------------------------------------------------------------------------
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_crypto
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+static std::ostream&
+_prefix(std::ostream* _dout)
+{
+ return *_dout << "OpensslCryptoAccel: ";
+}
+// -----------------------------------------------------------------------------
+
+#define EVP_SUCCESS 1
+#define AES_ENCRYPT 1
+#define AES_DECRYPT 0
+
+bool evp_transform(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char* iv,
+ const unsigned char* key,
+ ENGINE* engine,
+ const EVP_CIPHER* const type,
+ const int encrypt)
+{
+ using pctx_t = std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)>;
+ pctx_t pctx{ EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free };
+
+ if (!pctx) {
+ derr << "failed to create evp cipher context" << dendl;
+ return false;
+ }
+
+ if (EVP_CipherInit_ex(pctx.get(), type, engine, key, iv, encrypt) != EVP_SUCCESS) {
+ derr << "EVP_CipherInit_ex failed" << dendl;
+ return false;
+ }
+
+ if (EVP_CIPHER_CTX_set_padding(pctx.get(), 0) != EVP_SUCCESS) {
+ derr << "failed to disable PKCS padding" << dendl;
+ return false;
+ }
+
+ int len_update = 0;
+ if (EVP_CipherUpdate(pctx.get(), out, &len_update, in, size) != EVP_SUCCESS) {
+ derr << "EVP_CipherUpdate failed" << dendl;
+ return false;
+ }
+
+ int len_final = 0;
+ if (EVP_CipherFinal_ex(pctx.get(), out + len_update, &len_final) != EVP_SUCCESS) {
+ derr << "EVP_CipherFinal_ex failed" << dendl;
+ return false;
+ }
+
+ ceph_assert(len_final == 0);
+ return (len_update + len_final) == static_cast<int>(size);
+}
+
+bool OpenSSLCryptoAccel::cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+
+ return evp_transform(out, in, size, const_cast<unsigned char*>(&iv[0]),
+ const_cast<unsigned char*>(&key[0]),
+ nullptr, // Hardware acceleration engine can be used in the future
+ EVP_aes_256_cbc(), AES_ENCRYPT);
+}
+
+bool OpenSSLCryptoAccel::cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+
+ return evp_transform(out, in, size, const_cast<unsigned char*>(&iv[0]),
+ const_cast<unsigned char*>(&key[0]),
+ nullptr, // Hardware acceleration engine can be used in the future
+ EVP_aes_256_cbc(), AES_DECRYPT);
+}
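
The evp_transform() helper above maps onto a short stand-alone program, which is handy for sanity-checking the round trip outside Ceph. A minimal sketch of the same EVP_CipherInit_ex/EVP_CipherUpdate/EVP_CipherFinal_ex sequence with padding disabled (illustration only; the zero key/IV and 32-byte buffer are made up):

    #include <stdio.h>
    #include <string.h>
    #include <openssl/evp.h>

    static int transform(unsigned char *out, const unsigned char *in, int size,
                         const unsigned char *iv, const unsigned char *key, int enc)
    {
            EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
            int len = 0, fin = 0, ok;

            ok = ctx != NULL
                && EVP_CipherInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv, enc) == 1
                && EVP_CIPHER_CTX_set_padding(ctx, 0) == 1  /* buffers are block-aligned */
                && EVP_CipherUpdate(ctx, out, &len, in, size) == 1
                && EVP_CipherFinal_ex(ctx, out + len, &fin) == 1;
            EVP_CIPHER_CTX_free(ctx);
            return ok && (len + fin) == size;
    }

    int main(void)
    {
            unsigned char key[32] = { 0 }, iv[16] = { 0 };
            unsigned char pt[32], ct[32], rt[32];

            memset(pt, 0x22, sizeof(pt));
            if (!transform(ct, pt, sizeof(pt), iv, key, 1) ||  /* encrypt */
                !transform(rt, ct, sizeof(ct), iv, key, 0))    /* decrypt */
                    return 1;
            printf("round trip %s\n", memcmp(pt, rt, sizeof(pt)) == 0 ? "ok" : "failed");
            return 0;
    }
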
diff --git a/src/crypto/openssl/openssl_crypto_accel.h b/src/crypto/openssl/openssl_crypto_accel.h
new file mode 100644
index 000000000..ad90cbece
--- /dev/null
+++ b/src/crypto/openssl/openssl_crypto_accel.h
@@ -0,0 +1,32 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef OPENSSL_CRYPTO_ACCEL_H
+#define OPENSSL_CRYPTO_ACCEL_H
+
+#include "crypto/crypto_accel.h"
+
+class OpenSSLCryptoAccel : public CryptoAccel {
+ public:
+ OpenSSLCryptoAccel() {}
+ virtual ~OpenSSLCryptoAccel() {}
+
+ bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+ bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+};
+#endif
diff --git a/src/crypto/openssl/openssl_crypto_plugin.cc b/src/crypto/openssl/openssl_crypto_plugin.cc
new file mode 100644
index 000000000..e6ecea2fd
--- /dev/null
+++ b/src/crypto/openssl/openssl_crypto_plugin.cc
@@ -0,0 +1,32 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+
+#include "crypto/openssl/openssl_crypto_plugin.h"
+
+#include "ceph_ver.h"
+
+const char *__ceph_plugin_version()
+{
+ return CEPH_GIT_NICE_VER;
+}
+
+int __ceph_plugin_init(CephContext *cct,
+ const std::string& type,
+ const std::string& name)
+{
+ auto instance = cct->get_plugin_registry();
+
+ return instance->add(type, name, new OpenSSLCryptoPlugin(cct));
+}
diff --git a/src/crypto/openssl/openssl_crypto_plugin.h b/src/crypto/openssl/openssl_crypto_plugin.h
new file mode 100644
index 000000000..408d9ebda
--- /dev/null
+++ b/src/crypto/openssl/openssl_crypto_plugin.h
@@ -0,0 +1,36 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef OPENSSL_CRYPTO_PLUGIN_H
+#define OPENSSL_CRYPTO_PLUGIN_H
+
+#include "crypto/crypto_plugin.h"
+#include "crypto/openssl/openssl_crypto_accel.h"
+
+
+class OpenSSLCryptoPlugin : public CryptoPlugin {
+
+ CryptoAccelRef cryptoaccel;
+public:
+ explicit OpenSSLCryptoPlugin(CephContext* cct) : CryptoPlugin(cct)
+ {}
+ int factory(CryptoAccelRef *cs, std::ostream *ss) override {
+ if (cryptoaccel == nullptr)
+ cryptoaccel = CryptoAccelRef(new OpenSSLCryptoAccel);
+
+ *cs = cryptoaccel;
+ return 0;
+ }
+};
+#endif
diff --git a/src/crypto/qat/CMakeLists.txt b/src/crypto/qat/CMakeLists.txt
new file mode 100644
index 000000000..fb751967a
--- /dev/null
+++ b/src/crypto/qat/CMakeLists.txt
@@ -0,0 +1,20 @@
+##
+# QAT wrapper for Ceph
+##
+
+set(qat_crypto_plugin_srcs
+ qat_crypto_accel.cc
+ qat_crypto_plugin.cc
+ qcccrypto.cc)
+
+add_library(ceph_crypto_qat SHARED ${qat_crypto_plugin_srcs})
+
+add_dependencies(crypto_plugins ceph_crypto_qat)
+
+target_link_libraries(ceph_crypto_qat PRIVATE
+ QatDrv::qat_s
+ QatDrv::usdm_drv_s)
+
+add_dependencies(crypto_plugins ceph_crypto_qat)
+set_target_properties(ceph_crypto_qat PROPERTIES VERSION 1.0.0 SOVERSION 1)
+install(TARGETS ceph_crypto_qat DESTINATION ${crypto_plugin_dir})
diff --git a/src/crypto/qat/qat_crypto_accel.cc b/src/crypto/qat/qat_crypto_accel.cc
new file mode 100644
index 000000000..23f86edfa
--- /dev/null
+++ b/src/crypto/qat/qat_crypto_accel.cc
@@ -0,0 +1,42 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ * Author: Ganesh Mahalingam <ganesh.mahalingam@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "crypto/qat/qat_crypto_accel.h"
+
+bool QccCryptoAccel::cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+
+ return qcccrypto.perform_op(out, in, size,
+ const_cast<unsigned char *>(&iv[0]),
+ const_cast<unsigned char *>(&key[0]), CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT);
+}
+
+bool QccCryptoAccel::cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+
+ return qcccrypto.perform_op(out, in, size,
+ const_cast<unsigned char *>(&iv[0]),
+ const_cast<unsigned char *>(&key[0]), CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT);
+}
diff --git a/src/crypto/qat/qat_crypto_accel.h b/src/crypto/qat/qat_crypto_accel.h
new file mode 100644
index 000000000..5badefc28
--- /dev/null
+++ b/src/crypto/qat/qat_crypto_accel.h
@@ -0,0 +1,35 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ * Author: Ganesh Mahalingam <ganesh.mahalingam@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef QAT_CRYPTO_ACCEL_H
+#define QAT_CRYPTO_ACCEL_H
+
+#include "crypto/crypto_accel.h"
+#include "crypto/qat/qcccrypto.h"
+
+class QccCryptoAccel : public CryptoAccel {
+ public:
+ QccCrypto qcccrypto;
+ QccCryptoAccel() { qcccrypto.init(); };
+ ~QccCryptoAccel() { qcccrypto.destroy(); };
+
+ bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+ bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+};
+#endif
diff --git a/src/crypto/qat/qat_crypto_plugin.cc b/src/crypto/qat/qat_crypto_plugin.cc
new file mode 100644
index 000000000..4bf3d61bb
--- /dev/null
+++ b/src/crypto/qat/qat_crypto_plugin.cc
@@ -0,0 +1,35 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ * Author: Ganesh Mahalingam <ganesh.mahalingam@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+
+#include "crypto/qat/qat_crypto_plugin.h"
+
+#include "ceph_ver.h"
+
+std::mutex QccCryptoPlugin::qat_init;
+
+const char *__ceph_plugin_version()
+{
+ return CEPH_GIT_NICE_VER;
+}
+
+int __ceph_plugin_init(CephContext *cct,
+ const std::string& type,
+ const std::string& name)
+{
+ PluginRegistry *instance = cct->get_plugin_registry();
+
+ return instance->add(type, name, new QccCryptoPlugin(cct));
+}
diff --git a/src/crypto/qat/qat_crypto_plugin.h b/src/crypto/qat/qat_crypto_plugin.h
new file mode 100644
index 000000000..a8d4df7cb
--- /dev/null
+++ b/src/crypto/qat/qat_crypto_plugin.h
@@ -0,0 +1,42 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ * Author: Ganesh Mahalingam <ganesh.mahalingam@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef QAT_CRYPTO_PLUGIN_H
+#define QAT_CRYPTO_PLUGIN_H
+
+#include "crypto/crypto_plugin.h"
+#include "crypto/qat/qat_crypto_accel.h"
+
+
+class QccCryptoPlugin : public CryptoPlugin {
+ static std::mutex qat_init;
+
+public:
+
+ explicit QccCryptoPlugin(CephContext* cct) : CryptoPlugin(cct)
+ {}
+ ~QccCryptoPlugin()
+ {}
+ virtual int factory(CryptoAccelRef *cs, std::ostream *ss)
+ {
+ std::lock_guard<std::mutex> l(qat_init);
+ if (cryptoaccel == nullptr)
+ cryptoaccel = CryptoAccelRef(new QccCryptoAccel);
+
+ *cs = cryptoaccel;
+ return 0;
+ }
+};
+#endif
diff --git a/src/crypto/qat/qcccrypto.cc b/src/crypto/qat/qcccrypto.cc
new file mode 100644
index 000000000..a3f253726
--- /dev/null
+++ b/src/crypto/qat/qcccrypto.cc
@@ -0,0 +1,471 @@
+#include "qcccrypto.h"
+#include <iostream>
+#include "string.h"
+#include <pthread.h>
+#include "common/debug.h"
+#include "include/scope_guard.h"
+#include "common/dout.h"
+#include "common/errno.h"
+
+// -----------------------------------------------------------------------------
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+static std::ostream& _prefix(std::ostream* _dout)
+{
+ return *_dout << "QccCrypto: ";
+}
+// -----------------------------------------------------------------------------
+
+/*
+ * Poller thread & functions
+*/
+static std::mutex qcc_alloc_mutex;
+static std::mutex qcc_eng_mutex;
+static std::atomic<bool> init_called = { false };
+
+void* QccCrypto::crypt_thread(void *args) {
+ struct qcc_thread_args *thread_args = (struct qcc_thread_args *)args;
+ thread_args->qccinstance->do_crypt(thread_args);
+ return thread_args;
+}
+
+void QccCrypto::QccFreeInstance(int entry) {
+ std::lock_guard<std::mutex> freeinst(qcc_alloc_mutex);
+ open_instances.push(entry);
+}
+
+int QccCrypto::QccGetFreeInstance() {
+ int ret = -1;
+ std::lock_guard<std::mutex> getinst(qcc_alloc_mutex);
+ if (!open_instances.empty()) {
+ ret = open_instances.front();
+ open_instances.pop();
+ }
+ return ret;
+}
+
+void QccCrypto::cleanup() {
+ icp_sal_userStop();
+ qaeMemDestroy();
+ is_init = false;
+ init_stat = stat;
+ init_called = false;
+ derr << "Failure during QAT init sequence. Quitting" << dendl;
+}
+
+/*
+ * We initialize the QAT instance and everything that is common to all ops
+*/
+bool QccCrypto::init()
+{
+
+ std::lock_guard<std::mutex> l(qcc_eng_mutex);
+
+ if(init_called) {
+ dout(10) << "Init sequence already called. Skipping duplicate call" << dendl;
+ return true;
+ }
+
+ // First call to init
+ dout(15) << "First init for QAT" << dendl;
+ init_called = true;
+
+  // Find out if the usermode memory driver is available. We need it to
+  // create the contiguous memory needed by QAT.
+ stat = qaeMemInit();
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to load memory driver" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ stat = icp_sal_userStart("CEPH");
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to start qat device" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ qcc_os_mem_alloc((void **)&qcc_inst, sizeof(QCCINST));
+ if(qcc_inst == NULL) {
+ derr << "Unable to alloc mem for instance struct" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ // Initialize contents of qcc_inst
+ qcc_inst->num_instances = 0;
+ qcc_inst->cy_inst_handles = NULL;
+
+ stat = cpaCyGetNumInstances(&(qcc_inst->num_instances));
+ if ((stat != CPA_STATUS_SUCCESS) || (qcc_inst->num_instances <= 0)) {
+ derr << "Unable to find available instances" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ qcc_os_mem_alloc((void **)&qcc_inst->cy_inst_handles,
+ ((int)qcc_inst->num_instances * sizeof(CpaInstanceHandle)));
+ if (qcc_inst->cy_inst_handles == NULL) {
+ derr << "Unable to allocate instances array memory" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ stat = cpaCyGetInstances(qcc_inst->num_instances, qcc_inst->cy_inst_handles);
+ if (stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to get instances" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ int iter = 0;
+ //Start Instances
+ for(iter = 0; iter < qcc_inst->num_instances; iter++) {
+ stat = cpaCyStartInstance(qcc_inst->cy_inst_handles[iter]);
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to start instance" << dendl;
+ this->cleanup();
+ return false;
+ }
+ }
+
+ qcc_os_mem_alloc((void **)&qcc_inst->is_polled,
+ ((int)qcc_inst->num_instances * sizeof(CpaBoolean)));
+ CpaInstanceInfo2 info;
+ for(iter = 0; iter < qcc_inst->num_instances; iter++) {
+ qcc_inst->is_polled[iter] = cpaCyInstanceGetInfo2(qcc_inst->cy_inst_handles[iter],
+ &info) == CPA_STATUS_SUCCESS ? info.isPolled : CPA_FALSE;
+ }
+
+ // Allocate memory structures for all instances
+ qcc_os_mem_alloc((void **)&qcc_sess,
+ ((int)qcc_inst->num_instances * sizeof(QCCSESS)));
+ if(qcc_sess == NULL) {
+ derr << "Unable to allocate memory for session struct" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ qcc_os_mem_alloc((void **)&qcc_op_mem,
+ ((int)qcc_inst->num_instances * sizeof(QCCOPMEM)));
+  if(qcc_op_mem == NULL) {
+ derr << "Unable to allocate memory for opmem struct" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ qcc_os_mem_alloc((void **)&cypollthreads,
+ ((int)qcc_inst->num_instances * sizeof(pthread_t)));
+ if(cypollthreads == NULL) {
+ derr << "Unable to allocate memory for pthreads" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+  // At this point we are only doing a user-space version.
+  // To-Do: Maybe a kernel-based one
+ for(iter = 0; iter < qcc_inst->num_instances; iter++) {
+ stat = cpaCySetAddressTranslation(qcc_inst->cy_inst_handles[iter],
+ qaeVirtToPhysNUMA);
+ if(stat == CPA_STATUS_SUCCESS) {
+ // Start HW Polling Thread
+ // To-Do: Enable epoll & interrupt based later?
+ // QccCyStartPoll(iter);
+ // Setup the session structures for crypto operation and populate
+ // whatever we can now. Rest will be filled in when crypto operation
+ // happens.
+ qcc_sess[iter].sess_ctx_sz = 0;
+ qcc_sess[iter].sess_ctx = NULL;
+ qcc_sess[iter].sess_stp_data.sessionPriority = CPA_CY_PRIORITY_NORMAL;
+ qcc_sess[iter].sess_stp_data.symOperation = CPA_CY_SYM_OP_CIPHER;
+ open_instances.push(iter);
+ qcc_op_mem[iter].is_mem_alloc = false;
+ qcc_op_mem[iter].op_complete = false;
+ qcc_op_mem[iter].op_result = CPA_STATUS_SUCCESS;
+ qcc_op_mem[iter].sym_op_data = NULL;
+ qcc_op_mem[iter].buff_meta_size = qcc_op_mem[iter].buff_size = 0;
+ qcc_op_mem[iter].src_buff_meta = qcc_op_mem[iter].src_buff
+ = qcc_op_mem[iter].iv_buff = NULL;
+ qcc_op_mem[iter].src_buff_list = NULL;
+ qcc_op_mem[iter].src_buff_flat = NULL;
+ qcc_op_mem[iter].num_buffers = 1;
+ } else {
+ derr << "Unable to find address translations of instance " << iter << dendl;
+ this->cleanup();
+ return false;
+ }
+ }
+ is_init = true;
+ dout(10) << "Init complete" << dendl;
+ return true;
+}
+
+bool QccCrypto::destroy() {
+ if((!is_init) || (!init_called)) {
+ dout(15) << "QAT not initialized here. Nothing to do" << dendl;
+ return false;
+ }
+
+ unsigned int retry = 0;
+ while(retry <= QCC_MAX_RETRIES) {
+ if(open_instances.size() == qcc_inst->num_instances) {
+ break;
+ } else {
+ retry++;
+ }
+ dout(5) << "QAT is still busy and cannot free resources yet" << dendl;
+ return false;
+ }
+
+ dout(10) << "Destroying QAT crypto & related memory" << dendl;
+ int iter = 0;
+
+ // Free up op related memory
+ for (iter =0; iter < qcc_inst->num_instances; iter++) {
+ qcc_contig_mem_free((void **)&(qcc_op_mem[iter].src_buff));
+ qcc_contig_mem_free((void **)&(qcc_op_mem[iter].iv_buff));
+ qcc_os_mem_free((void **)&(qcc_op_mem[iter].src_buff_list));
+ qcc_os_mem_free((void **)&(qcc_op_mem[iter].src_buff_flat));
+ qcc_contig_mem_free((void **)&(qcc_op_mem[iter].sym_op_data));
+ }
+
+ // Free up Session memory
+ for(iter = 0; iter < qcc_inst->num_instances; iter++) {
+ cpaCySymRemoveSession(qcc_inst->cy_inst_handles[iter], qcc_sess[iter].sess_ctx);
+ qcc_contig_mem_free((void **)&(qcc_sess[iter].sess_ctx));
+ }
+
+ // Stop QAT Instances
+ for(iter = 0; iter < qcc_inst->num_instances; iter++) {
+ cpaCyStopInstance(qcc_inst->cy_inst_handles[iter]);
+ }
+
+ // Free up the base structures we use
+ qcc_os_mem_free((void **)&qcc_op_mem);
+ qcc_os_mem_free((void **)&qcc_sess);
+ qcc_os_mem_free((void **)&(qcc_inst->cy_inst_handles));
+ qcc_os_mem_free((void **)&(qcc_inst->is_polled));
+ qcc_os_mem_free((void **)&cypollthreads);
+ qcc_os_mem_free((void **)&qcc_inst);
+
+ //Un-init memory driver and QAT HW
+ icp_sal_userStop();
+ qaeMemDestroy();
+ init_called = false;
+ is_init = false;
+ return true;
+}
+
+void QccCrypto::do_crypt(qcc_thread_args *thread_args) {
+ auto entry = thread_args->entry;
+ qcc_op_mem[entry].op_result = cpaCySymPerformOp(qcc_inst->cy_inst_handles[entry],
+ NULL,
+ qcc_op_mem[entry].sym_op_data,
+ qcc_op_mem[entry].src_buff_list,
+ qcc_op_mem[entry].src_buff_list,
+ NULL);
+ qcc_op_mem[entry].op_complete = true;
+ free(thread_args);
+}
+
+bool QccCrypto::perform_op(unsigned char* out, const unsigned char* in,
+ size_t size, uint8_t *iv, uint8_t *key, CpaCySymCipherDirection op_type)
+{
+ if (!init_called) {
+    dout(10) << "QAT not initialized yet. Initializing now..." << dendl;
+ if(!QccCrypto::init()) {
+ derr << "QAT init failed" << dendl;
+ return false;
+ }
+ }
+
+ if(!is_init)
+ {
+ dout(10) << "QAT not initialized in this instance or init failed with possible error " << (int)init_stat << dendl;
+ return is_init;
+ }
+
+ int avail_inst = -1;
+ unsigned int retrycount = 0;
+ while(retrycount <= QCC_MAX_RETRIES) {
+ avail_inst = QccGetFreeInstance();
+ if(avail_inst != -1) {
+ break;
+ } else {
+ retrycount++;
+ usleep(qcc_sleep_duration);
+ }
+ }
+
+ if(avail_inst == -1) {
+    derr << "Unable to get a QAT instance. Failing request" << dendl;
+ return false;
+ }
+
+ dout(15) << "Using inst " << avail_inst << dendl;
+ // Start polling threads for this instance
+ //QccCyStartPoll(avail_inst);
+
+ auto sg = make_scope_guard([=] {
+ //free up the instance irrespective of the op status
+ dout(15) << "Completed task under " << avail_inst << dendl;
+ qcc_op_mem[avail_inst].op_complete = false;
+ QccCrypto::QccFreeInstance(avail_inst);
+ });
+
+ /*
+   * Allocate buffers for this instance if not already done.
+   * Hold on to most of them until the destructor is called.
+ */
+ if (qcc_op_mem[avail_inst].is_mem_alloc == false) {
+
+ qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherAlgorithm =
+ CPA_CY_SYM_CIPHER_AES_CBC;
+ qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherKeyLenInBytes =
+ AES_256_KEY_SIZE;
+
+ // Allocate contig memory for buffers that are independent of the
+ // input/output
+ stat = cpaCyBufferListGetMetaSize(qcc_inst->cy_inst_handles[avail_inst],
+ qcc_op_mem[avail_inst].num_buffers, &(qcc_op_mem[avail_inst].buff_meta_size));
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to get buff meta size" << dendl;
+ return false;
+ }
+
+ // Allocate Buffer List Private metadata
+ stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff_meta),
+ qcc_op_mem[avail_inst].buff_meta_size, 1);
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to allocate private metadata memory" << dendl;
+ return false;
+ }
+
+ // Allocate Buffer List Memory
+ qcc_os_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff_list), sizeof(CpaBufferList));
+ qcc_os_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff_flat),
+ (qcc_op_mem[avail_inst].num_buffers * sizeof(CpaFlatBuffer)));
+ if(qcc_op_mem[avail_inst].src_buff_list == NULL || qcc_op_mem[avail_inst].src_buff_flat == NULL) {
+ derr << "Unable to allocate bufferlist memory" << dendl;
+ return false;
+ }
+
+ // Allocate IV memory
+ stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].iv_buff), AES_256_IV_LEN);
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to allocate bufferlist memory" << dendl;
+ return false;
+ }
+
+    // Assign source buffers for the operation
+ (qcc_op_mem[avail_inst].src_buff_list)->pBuffers = qcc_op_mem[avail_inst].src_buff_flat;
+ (qcc_op_mem[avail_inst].src_buff_list)->numBuffers = qcc_op_mem[avail_inst].num_buffers;
+ (qcc_op_mem[avail_inst].src_buff_list)->pPrivateMetaData = qcc_op_mem[avail_inst].src_buff_meta;
+
+ //Setup OpData
+ stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].sym_op_data),
+ sizeof(CpaCySymOpData));
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to allocate opdata memory" << dendl;
+ return false;
+ }
+
+ // Assume encryption while creating the session; the actual cipher
+ // direction is set again on every call below.
+ qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherDirection =
+ CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT;
+ // Allocate Session memory
+ stat = cpaCySymSessionCtxGetSize(qcc_inst->cy_inst_handles[avail_inst],
+ &(qcc_sess[avail_inst].sess_stp_data), &(qcc_sess[avail_inst].sess_ctx_sz));
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to find session size" << dendl;
+ return false;
+ }
+
+ stat = qcc_contig_mem_alloc((void **)&(qcc_sess[avail_inst].sess_ctx),
+ qcc_sess[avail_inst].sess_ctx_sz);
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to allocate contig memory" << dendl;
+ return false;
+ }
+
+ // Set memalloc flag so that we don't go through this exercise again.
+ qcc_op_mem[avail_inst].is_mem_alloc = true;
+ dout(15) << "Instantiation complete for " << avail_inst << dendl;
+ }
+
+ // Section that runs on every call
+ // Identify the operation and assign to session
+ qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherDirection = op_type;
+ qcc_sess[avail_inst].sess_stp_data.cipherSetupData.pCipherKey = (Cpa8U *)key;
+
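+ // (Re)initialize the session so the key and direction set above take
+ // effect for this operation.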
+ stat = cpaCySymInitSession(qcc_inst->cy_inst_handles[avail_inst],
+ NULL,
+ &(qcc_sess[avail_inst].sess_stp_data),
+ qcc_sess[avail_inst].sess_ctx);
+ if (stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to init session" << dendl;
+ return false;
+ }
+
+ // (Re)allocate the contiguous data buffer whenever the request size changes
+ if (qcc_op_mem[avail_inst].buff_size != (Cpa32U)size) {
+ qcc_contig_mem_free((void **)&(qcc_op_mem[avail_inst].src_buff));
+ qcc_op_mem[avail_inst].buff_size = (Cpa32U)size;
+ stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff),
+ qcc_op_mem[avail_inst].buff_size);
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to allocate contig memory" << dendl;
+ return false;
+ }
+ }
+
+ // Copy src & iv into the respective buffers
+ memcpy(qcc_op_mem[avail_inst].src_buff, in, size);
+ memcpy(qcc_op_mem[avail_inst].iv_buff, iv, AES_256_IV_LEN);
+
+ // Assign the remaining flat-buffer fields
+ qcc_op_mem[avail_inst].src_buff_flat->dataLenInBytes = qcc_op_mem[avail_inst].buff_size;
+ qcc_op_mem[avail_inst].src_buff_flat->pData = qcc_op_mem[avail_inst].src_buff;
+
+ //OpData assignment
+ qcc_op_mem[avail_inst].sym_op_data->sessionCtx = qcc_sess[avail_inst].sess_ctx;
+ qcc_op_mem[avail_inst].sym_op_data->packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
+ qcc_op_mem[avail_inst].sym_op_data->pIv = qcc_op_mem[avail_inst].iv_buff;
+ qcc_op_mem[avail_inst].sym_op_data->ivLenInBytes = AES_256_IV_LEN;
+ qcc_op_mem[avail_inst].sym_op_data->cryptoStartSrcOffsetInBytes = 0;
+ qcc_op_mem[avail_inst].sym_op_data->messageLenToCipherInBytes = qcc_op_mem[avail_inst].buff_size;
+
+ // Perform cipher operation in a thread
+ qcc_thread_args* thread_args = new qcc_thread_args();
+ thread_args->qccinstance = this;
+ thread_args->entry = avail_inst;
+
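+ // While the worker thread performs the op, poll the instance here (for
+ // polled instances) until the worker marks it complete, then join.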
+ if (pthread_create(&cypollthreads[avail_inst], NULL, crypt_thread, (void *)thread_args) != 0) {
+ derr << "Unable to create thread for crypt operation" << dendl;
+ return false;
+ }
+ if (qcc_inst->is_polled[avail_inst] == CPA_TRUE) {
+ while (!qcc_op_mem[avail_inst].op_complete) {
+ icp_sal_CyPollInstance(qcc_inst->cy_inst_handles[avail_inst], 0);
+ }
+ }
+ pthread_join(cypollthreads[avail_inst], NULL);
+
+ if(qcc_op_mem[avail_inst].op_result != CPA_STATUS_SUCCESS) {
+ derr << "Unable to perform crypt operation" << dendl;
+ return false;
+ }
+
+ //Copy data back to out buffer
+ memcpy(out, qcc_op_mem[avail_inst].src_buff, size);
+ // Always scrub the buffers holding user data before returning
+ memset(qcc_op_mem[avail_inst].iv_buff, 0, AES_256_IV_LEN);
+ memset(qcc_op_mem[avail_inst].src_buff, 0, qcc_op_mem[avail_inst].buff_size);
+
+ return true;
+}
diff --git a/src/crypto/qat/qcccrypto.h b/src/crypto/qat/qcccrypto.h
new file mode 100644
index 000000000..a36b0898b
--- /dev/null
+++ b/src/crypto/qat/qcccrypto.h
@@ -0,0 +1,176 @@
+#ifndef QCCCRYPTO_H
+#define QCCCRYPTO_H
+
+#include <atomic>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <queue>
+extern "C" {
+#include "cpa.h"
+#include "lac/cpa_cy_sym.h"
+#include "lac/cpa_cy_im.h"
+#include "qae_mem.h"
+#include "icp_sal_user.h"
+#include "icp_sal_poll.h"
+#include "qae_mem_utils.h"
+}
+
+class QccCrypto {
+
+ public:
+ CpaCySymCipherDirection qcc_op_type;
+
+ QccCrypto() {};
+ ~QccCrypto() {};
+
+ bool init();
+ bool destroy();
+ bool perform_op(unsigned char* out, const unsigned char* in, size_t size,
+ uint8_t *iv,
+ uint8_t *key,
+ CpaCySymCipherDirection op_type);
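+ // Illustrative usage (caller-provided 256-bit key and 16-byte IV assumed):
+ //   QccCrypto qcc;
+ //   qcc.perform_op(out, in, 4096, iv, key,
+ //                  CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT);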
+
+ private:
+
+ // Currently only supporting AES_256_CBC.
+ // To-Do: Needs to be expanded
+ static const size_t AES_256_IV_LEN = 16;
+ static const size_t AES_256_KEY_SIZE = 32;
+ static const size_t QCC_MAX_RETRIES = 5000;
+
+ /*
+ * Struct holding the QAT instances that handle the crypto operations. These
+ * are identified at start-up and held until the destructor is called.
+ * To-Do:
+ * The struct was created assuming that we will use all the instances.
+ * Expand the current implementation to allow multiple instances to operate
+ * independently.
+ */
+ struct QCCINST {
+ CpaInstanceHandle *cy_inst_handles;
+ CpaBoolean *is_polled;
+ Cpa16U num_instances;
+ } *qcc_inst;
+
+ /*
+ * QAT Crypto Session
+ * The session context and setup data hold the priority, the type of crypto
+ * operation (cipher/chained), the cipher algorithm (AES, DES, etc.), and
+ * whether single-buffer or multi-buffer crypto is used.
+ */
+ struct QCCSESS {
+ CpaCySymSessionSetupData sess_stp_data;
+ Cpa32U sess_ctx_sz;
+ CpaCySymSessionCtx sess_ctx;
+ } *qcc_sess;
+
+ /*
+ * Cipher Memory Allocations
+ * Holds the buffer list, flat buffer, cipher operation data and buffer
+ * metadata needed by QAT to perform the operation, plus buffers for the
+ * IV and the source/destination data.
+ */
+ struct QCCOPMEM {
+ // Op common items
+ bool is_mem_alloc;
+ bool op_complete;
+ CpaStatus op_result;
+ CpaCySymOpData *sym_op_data;
+ Cpa32U buff_meta_size;
+ Cpa32U num_buffers;
+ Cpa32U buff_size;
+
+ //Src data items
+ Cpa8U *src_buff_meta;
+ CpaBufferList *src_buff_list;
+ CpaFlatBuffer *src_buff_flat;
+ Cpa8U *src_buff;
+ Cpa8U *iv_buff;
+ } *qcc_op_mem;
+
+ //QAT HW polling thread input structure
+ struct qcc_thread_args {
+ QccCrypto* qccinstance;
+ int entry;
+ };
+
+
+ /*
+ * Function to handle the crypt operation. Will run while the main thread
+ * runs the polling function on the instance doing the op
+ */
+ void do_crypt(qcc_thread_args *thread_args);
+
+ /*
+ * Queue of free QAT instance indices available to service an op
+ */
+ std::queue<int> open_instances;
+ int QccGetFreeInstance();
+ void QccFreeInstance(int entry);
+
+ /*
+ * Contiguous memory allocator and de-allocator. We use the usdm driver
+ * that ships with QAT to obtain DMA-able memory backed by hugepages.
+ * To-Do: a kernel-based allocator.
+ */
+ static inline void qcc_contig_mem_free(void **ptr) {
+ if (*ptr) {
+ qaeMemFreeNUMA(ptr);
+ *ptr = NULL;
+ }
+ }
+
+ static inline CpaStatus qcc_contig_mem_alloc(void **ptr, Cpa32U size, Cpa32U alignment = 1) {
+ *ptr = qaeMemAllocNUMA(size, 0, alignment);
+ if (NULL == *ptr)
+ {
+ return CPA_STATUS_RESOURCE;
+ }
+ return CPA_STATUS_SUCCESS;
+ }
+
+ /*
+ * malloc & free calls wrapped to keep allocations consistent and to allow
+ * a future kernel-based allocator.
+ */
+ static inline void qcc_os_mem_free(void **ptr) {
+ if (*ptr) {
+ free(*ptr);
+ *ptr = NULL;
+ }
+ }
+
+ static inline CpaStatus qcc_os_mem_alloc(void **ptr, Cpa32U size) {
+ *ptr = malloc(size);
+ if (*ptr == NULL)
+ {
+ return CPA_STATUS_RESOURCE;
+ }
+ return CPA_STATUS_SUCCESS;
+ }
+
+ std::atomic<bool> is_init = { false };
+ CpaStatus init_stat, stat;
+
+ /*
+ * Function to clean up memory if the constructor fails
+ */
+ void cleanup();
+
+ /*
+ * Crypto Polling Function & helpers
+ * These retrieve responses from the QAT rings and dispatch the associated
+ * callbacks. For synchronous operation (like this one), the QAT library
+ * creates an internal callback for the operation.
+ */
+ static void* crypt_thread(void* entry);
+ CpaStatus QccCyStartPoll(int entry);
+ void poll_instance(int entry);
+
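+ // One polling/worker thread slot per QAT instance; qcc_sleep_duration is
+ // the usleep() interval (in microseconds) between instance-acquisition
+ // retries.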
+ pthread_t *cypollthreads;
+ static const size_t qcc_sleep_duration = 2;
+};
+#endif //QCCCRYPTO_H