Diffstat (limited to 'src/crypto')
-rw-r--r--  src/crypto/CMakeLists.txt | 12
-rw-r--r--  src/crypto/crypto_accel.h | 37
-rw-r--r--  src/crypto/crypto_plugin.h | 36
-rw-r--r--  src/crypto/isa-l/CMakeLists.txt | 36
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/LICENSE | 26
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/Makefile.am | 107
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/Makefile.nmake | 341
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/Makefile.unx | 41
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/README.md | 53
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/Release_notes.txt | 139
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/Makefile.am | 103
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm | 1777
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm | 1747
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm | 1746
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm | 1778
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm | 1530
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm | 1505
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm | 1504
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm | 1529
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm | 1961
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm | 1895
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm | 1897
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm | 1962
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm | 1707
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm | 1652
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm | 1651
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm | 1707
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm | 427
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm | 161
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm | 161
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm | 163
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm | 157
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm | 160
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm | 157
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm | 136
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm | 150
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm | 148
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm | 146
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm | 140
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm | 147
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm | 83
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c | 315
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c | 56
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h | 466
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c | 443
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c | 183
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm | 1996
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm | 1990
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm | 2033
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm | 2036
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm | 2030
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm | 2074
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm | 163
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm | 172
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c | 272
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c | 71
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c | 1937
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c | 322
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h | 476
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm | 320
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm | 268
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm | 280
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm | 74
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h | 300
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c | 141
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c | 102
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c | 142
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c | 100
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c | 116
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c | 247
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c | 207
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c | 106
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h | 1691
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c | 143
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c | 103
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c | 143
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c | 101
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c | 113
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c | 249
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c | 209
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c | 105
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h | 1035
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm | 78
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm | 78
-rwxr-xr-x  src/crypto/isa-l/isa-l_crypto/autogen.sh | 17
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/configure.ac | 174
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h | 161
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h | 340
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h | 76
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/aes_xts.h | 214
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/datastruct.asm | 79
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/intrinreg.h | 82
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/md5_mb.h | 372
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/memcpy.asm | 346
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h | 363
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h | 315
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h | 327
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h | 127
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/multibinary.asm | 271
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm | 149
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h | 377
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h | 376
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h | 422
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/test.h | 81
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/include/types.h | 71
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def | 59
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in | 11
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/make.inc | 239
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am | 83
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c | 249
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c | 249
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c | 253
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c | 249
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm | 55
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm | 73
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm | 243
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm | 251
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm | 313
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm | 244
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c | 41
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c | 44
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c | 40
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm | 222
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm | 235
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm | 280
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm | 223
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c | 151
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c | 196
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c | 291
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c | 223
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c | 123
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm | 850
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm | 782
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm | 778
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm | 917
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm | 83
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c | 193
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am | 67
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c | 137
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c | 71
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm | 502
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm | 509
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm | 403
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c | 387
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm | 494
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c | 122
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h | 323
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm | 83
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c | 180
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c | 438
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c | 217
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c | 110
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c | 240
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c | 212
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am | 71
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c | 151
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c | 67
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm | 702
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm | 649
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm | 500
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm | 698
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c | 102
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h | 202
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm | 82
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c | 206
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c | 248
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c | 107
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c | 272
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c | 78
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c | 138
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am | 88
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c | 250
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c | 249
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c | 256
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c | 250
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm | 55
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm | 74
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm | 224
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm | 251
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm | 250
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm | 225
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c | 40
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c | 41
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c | 40
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm | 238
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm | 242
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm | 243
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm | 238
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c | 157
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c | 196
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c | 291
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c | 226
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c | 129
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm | 559
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm | 413
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm | 410
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm | 515
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c | 104
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm | 83
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c | 228
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am | 87
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c | 253
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c | 253
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c | 258
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c | 255
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm | 54
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm | 74
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm | 230
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm | 252
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm | 267
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm | 230
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c | 40
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c | 41
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c | 40
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm | 253
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm | 239
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm | 257
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm | 254
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c | 158
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c | 197
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c | 294
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c | 235
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c | 130
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm | 927
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm | 428
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm | 423
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm | 639
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm | 84
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c | 212
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am | 91
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c | 254
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c | 254
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c | 259
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c | 254
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c | 254
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm | 54
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm | 72
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm | 218
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm | 239
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm | 266
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm | 221
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c | 44
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c | 42
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c | 42
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm | 258
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm | 266
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm | 279
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm | 256
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c | 171
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c | 197
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c | 294
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c | 264
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c | 143
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm | 438
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm | 420
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm | 483
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm | 639
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm | 254
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c | 256
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c | 46
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c | 38
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c | 65
-rw-r--r--  src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm | 394
-rwxr-xr-x  src/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh | 47
-rwxr-xr-x  src/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh | 38
-rw-r--r--  src/crypto/isa-l/isal_crypto_accel.cc | 43
-rw-r--r--  src/crypto/isa-l/isal_crypto_accel.h | 31
-rw-r--r--  src/crypto/isa-l/isal_crypto_plugin.cc | 34
-rw-r--r--  src/crypto/isa-l/isal_crypto_plugin.h | 47
-rw-r--r--  src/crypto/openssl/CMakeLists.txt | 14
-rw-r--r--  src/crypto/openssl/openssl_crypto_accel.cc | 104
-rw-r--r--  src/crypto/openssl/openssl_crypto_accel.h | 32
-rw-r--r--  src/crypto/openssl/openssl_crypto_plugin.cc | 32
-rw-r--r--  src/crypto/openssl/openssl_crypto_plugin.h | 36
-rw-r--r--  src/crypto/qat/CMakeLists.txt | 25
-rw-r--r--  src/crypto/qat/qat_crypto_accel.cc | 42
-rw-r--r--  src/crypto/qat/qat_crypto_accel.h | 35
-rw-r--r--  src/crypto/qat/qat_crypto_plugin.cc | 35
-rw-r--r--  src/crypto/qat/qat_crypto_plugin.h | 43
-rw-r--r--  src/crypto/qat/qcccrypto.cc | 469
-rw-r--r--  src/crypto/qat/qcccrypto.h | 176
281 files changed, 98689 insertions, 0 deletions
diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt
new file mode 100644
index 000000000..f930886d3
--- /dev/null
+++ b/src/crypto/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_custom_target(crypto_plugins)
+set(crypto_plugin_dir ${CEPH_INSTALL_PKGLIBDIR}/crypto)
+
+add_subdirectory(openssl)
+
+if(HAVE_INTEL AND HAVE_NASM_X64_AVX2 AND (NOT APPLE))
+ add_subdirectory(isa-l)
+endif()
+
+if(WITH_QAT)
+ add_subdirectory(qat)
+endif()
diff --git a/src/crypto/crypto_accel.h b/src/crypto/crypto_accel.h
new file mode 100644
index 000000000..5c1593609
--- /dev/null
+++ b/src/crypto/crypto_accel.h
@@ -0,0 +1,37 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CRYPTO_ACCEL_H
+#define CRYPTO_ACCEL_H
+#include <cstddef>
+#include "include/Context.h"
+
+class CryptoAccel;
+typedef std::shared_ptr<CryptoAccel> CryptoAccelRef;
+
+class CryptoAccel {
+ public:
+ CryptoAccel() {}
+ virtual ~CryptoAccel() {}
+
+ static const int AES_256_IVSIZE = 128/8;
+ static const int AES_256_KEYSIZE = 256/8;
+ virtual bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) = 0;
+ virtual bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) = 0;
+};
+#endif
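
The CryptoAccel interface above is the contract each plugin backend in this patch (ISA-L, OpenSSL, QAT) implements. A minimal sketch of how a caller could drive it through a CryptoAccelRef follows; the helper name and the guard condition are illustrative, not part of the patch:

    // Hedged sketch: exercises the CryptoAccel interface defined above.
    // "accel" is assumed to have been obtained from a loaded crypto plugin.
    bool encrypt_sector(CryptoAccelRef accel,
                        unsigned char* out, const unsigned char* in, size_t size,
                        const unsigned char (&iv)[CryptoAccel::AES_256_IVSIZE],
                        const unsigned char (&key)[CryptoAccel::AES_256_KEYSIZE])
    {
      // AES-256-CBC operates on whole 16-byte blocks, so reject ragged sizes up front.
      if (!accel || size % CryptoAccel::AES_256_IVSIZE != 0)
        return false;
      return accel->cbc_encrypt(out, in, size, iv, key);
    }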
diff --git a/src/crypto/crypto_plugin.h b/src/crypto/crypto_plugin.h
new file mode 100644
index 000000000..cf22d5cb4
--- /dev/null
+++ b/src/crypto/crypto_plugin.h
@@ -0,0 +1,36 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef CRYPTO_PLUGIN_H
+#define CRYPTO_PLUGIN_H
+
+// -----------------------------------------------------------------------------
+#include "common/PluginRegistry.h"
+#include "ostream"
+
+#include "crypto/crypto_accel.h"
+// -----------------------------------------------------------------------------
+
+class CryptoPlugin : public ceph::Plugin {
+
+public:
+ CryptoAccelRef cryptoaccel;
+ explicit CryptoPlugin(CephContext* cct) : Plugin(cct)
+ {}
+ ~CryptoPlugin()
+ {}
+ virtual int factory(CryptoAccelRef *cs,
+ std::ostream *ss) = 0;
+};
+#endif
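
A concrete plugin derives from CryptoPlugin and fills in factory(); the ISA-L, OpenSSL, and QAT plugins later in this patch follow that shape. The sketch below is hypothetical (ExampleCryptoAccel and ExampleCryptoPlugin are made-up names; see isal_crypto_plugin.h in this patch for a real variant):

    // Hypothetical, trivial backend used only to illustrate the factory() contract;
    // the real backends live under isa-l/, openssl/, and qat/.
    class ExampleCryptoAccel : public CryptoAccel {
     public:
      bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
                       const unsigned char (&iv)[AES_256_IVSIZE],
                       const unsigned char (&key)[AES_256_KEYSIZE]) override
      { return false; /* a real plugin calls its accelerated AES-CBC here */ }
      bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
                       const unsigned char (&iv)[AES_256_IVSIZE],
                       const unsigned char (&key)[AES_256_KEYSIZE]) override
      { return false; }
    };

    class ExampleCryptoPlugin : public CryptoPlugin {
     public:
      explicit ExampleCryptoPlugin(CephContext* cct) : CryptoPlugin(cct) {}
      int factory(CryptoAccelRef* cs, std::ostream* ss) override {
        if (!cryptoaccel)
          cryptoaccel = CryptoAccelRef(new ExampleCryptoAccel);  // created once, then cached
        *cs = cryptoaccel;
        return 0;
      }
    };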
diff --git a/src/crypto/isa-l/CMakeLists.txt b/src/crypto/isa-l/CMakeLists.txt
new file mode 100644
index 000000000..2a2ec0bc0
--- /dev/null
+++ b/src/crypto/isa-l/CMakeLists.txt
@@ -0,0 +1,36 @@
+set(isal_dir ${CMAKE_SOURCE_DIR}/src/crypto/isa-l/isa-l_crypto)
+set(CMAKE_ASM_FLAGS "-i ${isal_dir}/aes/ -i ${isal_dir}/include/ ${CMAKE_ASM_FLAGS}")
+
+set(isal_crypto_plugin_srcs
+ isal_crypto_accel.cc
+ isal_crypto_plugin.cc
+ ${isal_dir}/aes/cbc_pre.c
+ ${isal_dir}/aes/cbc_multibinary.asm
+ ${isal_dir}/aes/keyexp_128.asm
+ ${isal_dir}/aes/keyexp_192.asm
+ ${isal_dir}/aes/keyexp_256.asm
+ ${isal_dir}/aes/keyexp_multibinary.asm
+ ${isal_dir}/aes/cbc_dec_128_x4_sse.asm
+ ${isal_dir}/aes/cbc_dec_128_x8_avx.asm
+ ${isal_dir}/aes/cbc_dec_192_x4_sse.asm
+ ${isal_dir}/aes/cbc_dec_192_x8_avx.asm
+ ${isal_dir}/aes/cbc_dec_256_x4_sse.asm
+ ${isal_dir}/aes/cbc_dec_256_x8_avx.asm
+ ${isal_dir}/aes/cbc_enc_128_x4_sb.asm
+ ${isal_dir}/aes/cbc_enc_128_x8_sb.asm
+ ${isal_dir}/aes/cbc_enc_192_x4_sb.asm
+ ${isal_dir}/aes/cbc_enc_192_x8_sb.asm
+ ${isal_dir}/aes/cbc_enc_256_x4_sb.asm
+ ${isal_dir}/aes/cbc_enc_256_x8_sb.asm)
+
+if(HAVE_NASM_X64)
+add_dependencies(crypto_plugins ceph_crypto_isal)
+endif(HAVE_NASM_X64)
+
+add_library(ceph_crypto_isal SHARED ${isal_crypto_plugin_srcs})
+target_include_directories(ceph_crypto_isal PRIVATE ${isal_dir}/include)
+set_target_properties(ceph_crypto_isal PROPERTIES
+ VERSION 1.0.0
+ SOVERSION 1
+ INSTALL_RPATH "")
+install(TARGETS ceph_crypto_isal DESTINATION ${crypto_plugin_dir})
diff --git a/src/crypto/isa-l/isa-l_crypto/LICENSE b/src/crypto/isa-l/isa-l_crypto/LICENSE
new file mode 100644
index 000000000..b0bdbf662
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/LICENSE
@@ -0,0 +1,26 @@
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/crypto/isa-l/isa-l_crypto/Makefile.am b/src/crypto/isa-l/isa-l_crypto/Makefile.am
new file mode 100644
index 000000000..4c0b58d31
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/Makefile.am
@@ -0,0 +1,107 @@
+EXTRA_DIST = autogen.sh Makefile.unx make.inc Makefile.nmake isa-l_crypto.def LICENSE
+CLEANFILES =
+LDADD =
+AM_MAKEFLAGS = --no-print-directory
+noinst_HEADERS =
+pkginclude_HEADERS = include/test.h include/types.h
+noinst_LTLIBRARIES =
+INCLUDE = -I $(srcdir)/include/
+AM_CFLAGS = ${my_CFLAGS} ${INCLUDE} ${D}
+
+lsrc=
+extern_hdrs=
+other_src=
+check_tests=
+unit_tests=
+perf_tests=
+unit_tests_extra=
+perf_tests_extra=
+examples=
+other_tests=
+lsrc32=
+unit_tests32=
+perf_tests32=
+
+# Include units
+include md5_mb/Makefile.am
+include sha1_mb/Makefile.am
+include sha256_mb/Makefile.am
+include sha512_mb/Makefile.am
+include mh_sha1/Makefile.am
+include mh_sha1_murmur3_x64_128/Makefile.am
+include aes/Makefile.am
+
+# LIB version info not necessarily the same as package version
+LIBISAL_CURRENT=2
+LIBISAL_REVISION=16
+LIBISAL_AGE=0
+
+lib_LTLIBRARIES = libisal_crypto.la
+pkginclude_HEADERS += $(sort ${extern_hdrs})
+libisal_crypto_la_SOURCES = ${lsrc}
+nobase_include_HEADERS = isa-l_crypto.h
+libisal_crypto_la_LDFLAGS = $(AM_LDFLAGS) \
+ -version-info $(LIBISAL_CURRENT):$(LIBISAL_REVISION):$(LIBISAL_AGE)
+libisal_crypto_la_LIBADD = ${noinst_LTLIBRARIES}
+
+EXTRA_DIST += ${other_src}
+EXTRA_DIST += Release_notes.txt
+
+# For tests
+LDADD += libisal_crypto.la
+check_PROGRAMS = ${check_tests}
+TESTS = ${check_tests}
+
+# For additional tests
+EXTRA_PROGRAMS = ${unit_tests}
+EXTRA_PROGRAMS += ${perf_tests}
+EXTRA_PROGRAMS += ${other_tests}
+EXTRA_PROGRAMS += ${examples}
+CLEANFILES += ${EXTRA_PROGRAMS}
+
+perfs: ${perf_tests}
+tests: ${unit_tests}
+other: ${other_tests}
+perf: $(addsuffix .run,$(perf_tests))
+ex: ${examples}
+test: $(addsuffix .run,$(unit_tests))
+
+# Build rule to run tests
+%.run: %
+ $<
+ @echo Completed run: $<
+
+# Support for yasm/nasm
+if USE_YASM
+ as_filter = ${srcdir}/tools/yasm-filter.sh
+endif
+if USE_NASM
+ as_filter = ${srcdir}/tools/nasm-filter.sh
+ as_include = $(addprefix -I ${srcdir}/,$(sort $(dir $(lsrc))))
+endif
+
+CCAS = $(as_filter)
+EXTRA_DIST += tools/yasm-filter.sh tools/nasm-filter.sh
+AM_CCASFLAGS = ${yasm_args} ${INCLUDE} ${as_include} ${DEFS} ${D}
+
+.asm.s:
+ @echo " MKTMP " $@;
+ @cp $< $@
+
+# Generate isa-l_crypto.h
+BUILT_SOURCES = isa-l_crypto.h
+CLEANFILES += isa-l_crypto.h
+isa-l_crypto.h:
+ @echo 'Building $@'
+ @echo '' >> $@
+ @echo '#ifndef _ISAL_CRYPTO_H_' >> $@
+ @echo '#define _ISAL_CRYPTO_H_' >> $@
+ @echo '' >> $@
+ @echo '#define.ISAL_CRYPTO_MAJOR_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$3}' >> $@
+ @echo '#define.ISAL_CRYPTO_MINOR_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$4}' >> $@
+ @echo '#define.ISAL_CRYPTO_PATCH_VERSION.${VERSION}' | ${AWK} -F . '{print $$1, $$2, $$5}' >> $@
+ @echo '#define ISAL_CRYPTO_MAKE_VERSION(maj, min, patch) ((maj) * 0x10000 + (min) * 0x100 + (patch))' >> $@
+ @echo '#define ISAL_CRYPTO_VERSION ISAL_CRYPTO_MAKE_VERSION(ISAL_CRYPTO_MAJOR_VERSION, ISAL_CRYPTO_MINOR_VERSION, ISAL_CRYPTO_PATCH_VERSION)' >> $@
+ @echo '' >> $@
+ @for unit in $(sort $(extern_hdrs)); do echo "#include <isa-l_crypto/$$unit>" | sed -e 's;include/;;' >> $@; done
+ @echo '#endif //_ISAL_CRYPTO_H_' >> $@
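
The isa-l_crypto.h rule above relies on '.' being both the literal separator in the echoed string and the awk field separator, so each echo prints the #define keyword, the macro name, and one component of ${VERSION}. Assuming VERSION were 2.16.0, the generated header would start roughly as follows (include list abbreviated; exact contents depend on the configured version and extern_hdrs):

    #ifndef _ISAL_CRYPTO_H_
    #define _ISAL_CRYPTO_H_

    #define ISAL_CRYPTO_MAJOR_VERSION 2
    #define ISAL_CRYPTO_MINOR_VERSION 16
    #define ISAL_CRYPTO_PATCH_VERSION 0
    #define ISAL_CRYPTO_MAKE_VERSION(maj, min, patch) ((maj) * 0x10000 + (min) * 0x100 + (patch))
    #define ISAL_CRYPTO_VERSION ISAL_CRYPTO_MAKE_VERSION(ISAL_CRYPTO_MAJOR_VERSION, ISAL_CRYPTO_MINOR_VERSION, ISAL_CRYPTO_PATCH_VERSION)

    #include <isa-l_crypto/aes_cbc.h>
    #include <isa-l_crypto/aes_gcm.h>
    /* ...one line per header collected in extern_hdrs... */
    #endif //_ISAL_CRYPTO_H_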
diff --git a/src/crypto/isa-l/isa-l_crypto/Makefile.nmake b/src/crypto/isa-l/isa-l_crypto/Makefile.nmake
new file mode 100644
index 000000000..15ce37820
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/Makefile.nmake
@@ -0,0 +1,341 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+objs = \
+ bin\md5_ctx_avx.obj \
+ bin\md5_ctx_avx2.obj \
+ bin\md5_ctx_avx512.obj \
+ bin\md5_ctx_sse.obj \
+ bin\md5_mb_mgr_flush_avx.obj \
+ bin\md5_mb_mgr_flush_avx2.obj \
+ bin\md5_mb_mgr_flush_avx512.obj \
+ bin\md5_mb_mgr_flush_sse.obj \
+ bin\md5_mb_mgr_init_avx2.obj \
+ bin\md5_mb_mgr_init_avx512.obj \
+ bin\md5_mb_mgr_init_sse.obj \
+ bin\md5_mb_mgr_submit_avx.obj \
+ bin\md5_mb_mgr_submit_avx2.obj \
+ bin\md5_mb_mgr_submit_avx512.obj \
+ bin\md5_mb_mgr_submit_sse.obj \
+ bin\md5_mb_x16x2_avx512.obj \
+ bin\md5_mb_x4x2_avx.obj \
+ bin\md5_mb_x4x2_sse.obj \
+ bin\md5_mb_x8x2_avx2.obj \
+ bin\md5_multibinary.obj \
+ bin\sha1_ctx_avx.obj \
+ bin\sha1_ctx_avx2.obj \
+ bin\sha1_ctx_avx512.obj \
+ bin\sha1_ctx_sse.obj \
+ bin\sha1_mb_mgr_flush_avx.obj \
+ bin\sha1_mb_mgr_flush_avx2.obj \
+ bin\sha1_mb_mgr_flush_avx512.obj \
+ bin\sha1_mb_mgr_flush_sse.obj \
+ bin\sha1_mb_mgr_init_avx2.obj \
+ bin\sha1_mb_mgr_init_avx512.obj \
+ bin\sha1_mb_mgr_init_sse.obj \
+ bin\sha1_mb_mgr_submit_avx.obj \
+ bin\sha1_mb_mgr_submit_avx2.obj \
+ bin\sha1_mb_mgr_submit_avx512.obj \
+ bin\sha1_mb_mgr_submit_sse.obj \
+ bin\sha1_mb_x16_avx512.obj \
+ bin\sha1_mb_x4_avx.obj \
+ bin\sha1_mb_x4_sse.obj \
+ bin\sha1_mb_x8_avx2.obj \
+ bin\sha1_multibinary.obj \
+ bin\sha256_ctx_avx.obj \
+ bin\sha256_ctx_avx2.obj \
+ bin\sha256_ctx_avx512.obj \
+ bin\sha256_ctx_sse.obj \
+ bin\sha256_mb_mgr_flush_avx.obj \
+ bin\sha256_mb_mgr_flush_avx2.obj \
+ bin\sha256_mb_mgr_flush_avx512.obj \
+ bin\sha256_mb_mgr_flush_sse.obj \
+ bin\sha256_mb_mgr_init_avx2.obj \
+ bin\sha256_mb_mgr_init_avx512.obj \
+ bin\sha256_mb_mgr_init_sse.obj \
+ bin\sha256_mb_mgr_submit_avx.obj \
+ bin\sha256_mb_mgr_submit_avx2.obj \
+ bin\sha256_mb_mgr_submit_avx512.obj \
+ bin\sha256_mb_mgr_submit_sse.obj \
+ bin\sha256_mb_x16_avx512.obj \
+ bin\sha256_mb_x4_avx.obj \
+ bin\sha256_mb_x4_sse.obj \
+ bin\sha256_mb_x8_avx2.obj \
+ bin\sha256_multibinary.obj \
+ bin\sha512_ctx_avx.obj \
+ bin\sha512_ctx_avx2.obj \
+ bin\sha512_ctx_avx512.obj \
+ bin\sha512_ctx_sb_sse4.obj \
+ bin\sha512_ctx_sse.obj \
+ bin\sha512_mb_mgr_flush_avx.obj \
+ bin\sha512_mb_mgr_flush_avx2.obj \
+ bin\sha512_mb_mgr_flush_avx512.obj \
+ bin\sha512_mb_mgr_flush_sse.obj \
+ bin\sha512_mb_mgr_init_avx2.obj \
+ bin\sha512_mb_mgr_init_avx512.obj \
+ bin\sha512_mb_mgr_init_sse.obj \
+ bin\sha512_mb_mgr_submit_avx.obj \
+ bin\sha512_mb_mgr_submit_avx2.obj \
+ bin\sha512_mb_mgr_submit_avx512.obj \
+ bin\sha512_mb_mgr_submit_sse.obj \
+ bin\sha512_mb_x2_avx.obj \
+ bin\sha512_mb_x2_sse.obj \
+ bin\sha512_mb_x4_avx2.obj \
+ bin\sha512_mb_x8_avx512.obj \
+ bin\sha512_multibinary.obj \
+ bin\sha512_sb_mgr_flush_sse4.obj \
+ bin\sha512_sb_mgr_init_sse4.obj \
+ bin\sha512_sb_mgr_submit_sse4.obj \
+ bin\sha512_sse4.obj \
+ bin\XTS_AES_128_dec_avx.obj \
+ bin\XTS_AES_128_dec_expanded_key_avx.obj \
+ bin\XTS_AES_128_dec_expanded_key_sse.obj \
+ bin\XTS_AES_128_dec_sse.obj \
+ bin\XTS_AES_128_enc_avx.obj \
+ bin\XTS_AES_128_enc_expanded_key_avx.obj \
+ bin\XTS_AES_128_enc_expanded_key_sse.obj \
+ bin\XTS_AES_128_enc_sse.obj \
+ bin\XTS_AES_256_dec_avx.obj \
+ bin\XTS_AES_256_dec_expanded_key_avx.obj \
+ bin\XTS_AES_256_dec_expanded_key_sse.obj \
+ bin\XTS_AES_256_dec_sse.obj \
+ bin\XTS_AES_256_enc_avx.obj \
+ bin\XTS_AES_256_enc_expanded_key_avx.obj \
+ bin\XTS_AES_256_enc_expanded_key_sse.obj \
+ bin\XTS_AES_256_enc_sse.obj \
+ bin\cbc_dec_128_x4_sse.obj \
+ bin\cbc_dec_128_x8_avx.obj \
+ bin\cbc_dec_192_x4_sse.obj \
+ bin\cbc_dec_192_x8_avx.obj \
+ bin\cbc_dec_256_x4_sse.obj \
+ bin\cbc_dec_256_x8_avx.obj \
+ bin\cbc_enc_128_x4_sb.obj \
+ bin\cbc_enc_128_x8_sb.obj \
+ bin\cbc_enc_192_x4_sb.obj \
+ bin\cbc_enc_192_x8_sb.obj \
+ bin\cbc_enc_256_x4_sb.obj \
+ bin\cbc_enc_256_x8_sb.obj \
+ bin\cbc_multibinary.obj \
+ bin\cbc_pre.obj \
+ bin\gcm128_avx_gen2.obj \
+ bin\gcm128_avx_gen4.obj \
+ bin\gcm128_sse.obj \
+ bin\gcm256_avx_gen2.obj \
+ bin\gcm256_avx_gen4.obj \
+ bin\gcm256_sse.obj \
+ bin\gcm_multibinary.obj \
+ bin\gcm_pre.obj \
+ bin\keyexp_128.obj \
+ bin\keyexp_192.obj \
+ bin\keyexp_256.obj \
+ bin\keyexp_multibinary.obj \
+ bin\mh_sha1.obj \
+ bin\sha1_for_mh_sha1.obj \
+ bin\mh_sha1_avx512.obj \
+ bin\mh_sha1_block_avx.obj \
+ bin\mh_sha1_block_avx2.obj \
+ bin\mh_sha1_block_avx512.obj \
+ bin\mh_sha1_block_base.obj \
+ bin\mh_sha1_block_sse.obj \
+ bin\mh_sha1_finalize_base.obj \
+ bin\mh_sha1_multibinary.obj \
+ bin\mh_sha1_murmur3_x64_128.obj \
+ bin\mh_sha1_murmur3_x64_128_avx512.obj \
+ bin\mh_sha1_murmur3_x64_128_block_avx.obj \
+ bin\mh_sha1_murmur3_x64_128_block_avx2.obj \
+ bin\mh_sha1_murmur3_x64_128_block_avx512.obj \
+ bin\mh_sha1_murmur3_x64_128_block_sse.obj \
+ bin\mh_sha1_murmur3_x64_128_finalize_base.obj \
+ bin\mh_sha1_murmur3_x64_128_multibinary.obj \
+ bin\mh_sha1_murmur3_x64_128_update_base.obj \
+ bin\mh_sha1_update_base.obj \
+ bin\murmur3_x64_128_internal.obj \
+ bin\xts_aes_128_multibinary.obj \
+ bin\xts_aes_256_multibinary.obj
+
+INCLUDES = -I./ -Isha1_mb/ -Isha256_mb/ -Isha512_mb/ -Imd5_mb/ -Imh_sha1/ -Imh_sha1_murmur3_x64_128 -Iaes/ -Iinclude/
+LINKFLAGS = /nologo
+INCLUDES = $(INCLUDES) -Ic:\openssl\include
+LINKFLAGS = $(LINKFLAGS) /libpath:c:\openssl\lib
+CFLAGS = -O2 -D NDEBUG /nologo -D_USE_MATH_DEFINES -Qstd=c99 $(INCLUDES) $(D)
+AFLAGS = -f win64 $(INCLUDES) $(D)
+CC = icl
+AS = yasm
+
+lib: bin static dll
+static: bin isa-l_crypto_static.lib
+dll: bin isa-l_crypto.dll
+
+bin: ; -mkdir $@
+
+isa-l_crypto_static.lib: $(objs)
+ lib -out:$@ $?
+
+isa-l_crypto.dll: $(objs)
+ link -out:$@ -dll -def:isa-l_crypto.def $?
+
+{sha1_mb}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{sha1_mb}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{sha256_mb}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{sha256_mb}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{sha512_mb}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{sha512_mb}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{md5_mb}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{md5_mb}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{mh_sha1}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{mh_sha1}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{mh_sha1_murmur3_x64_128}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{mh_sha1_murmur3_x64_128}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+{aes}.c.obj:
+ $(CC) $(CFLAGS) /c -Fo$@ $?
+{aes}.asm.obj:
+ $(AS) $(AFLAGS) -o $@ $?
+
+
+# Examples
+ex = sha1_multi_buffer_example.exe
+ex: lib $(ex)
+
+$(ex): $(@B).obj
+
+.obj.exe:
+ link /out:$@ $(LINKFLAGS) isa-l_crypto.lib $?
+
+# Check tests
+checks = \
+ md5_mb_rand_test.exe \
+ md5_mb_rand_update_test.exe \
+ md5_mb_test.exe \
+ sha1_mb_rand_test.exe \
+ sha1_mb_rand_update_test.exe \
+ sha1_mb_test.exe \
+ sha256_mb_rand_test.exe \
+ sha256_mb_rand_update_test.exe \
+ sha256_mb_test.exe \
+ sha512_mb_rand_test.exe \
+ sha512_mb_rand_update_test.exe \
+ sha512_mb_test.exe \
+ mh_sha1_test.exe \
+ cbc_std_vectors_test.exe \
+ gcm_std_vectors_test.exe \
+ xts_128_expanded_key_test.exe \
+ xts_128_test.exe \
+ xts_256_expanded_key_test.exe \
+ xts_256_test.exe
+
+
+checks: lib $(checks)
+$(checks): $(@B).obj
+check: $(checks)
+ !$?
+
+# Unit tests
+tests = \
+ md5_mb_rand_ssl_test.exe \
+ sha1_mb_rand_ssl_test.exe \
+ sha256_mb_rand_ssl_test.exe \
+ sha512_mb_rand_ssl_test.exe \
+ cbc_std_vectors_random_test.exe \
+ gcm_std_vectors_random_test.exe \
+ xts_128_rand.exe \
+ xts_128_rand_ossl_test.exe \
+ xts_256_rand.exe \
+ xts_256_rand_ossl_test.exe \
+ mh_sha1_update_test.exe \
+ mh_sha1_murmur3_x64_128_test.exe \
+ mh_sha1_murmur3_x64_128_update_test.exe
+
+tests: lib $(tests)
+$(tests): $(@B).obj
+
+# Performance tests
+perfs = md5_mb_vs_ossl_perf.exe sha1_mb_vs_ossl_perf.exe sha256_mb_vs_ossl_perf.exe sha512_mb_vs_ossl_perf.exe cbc_ossl_perf.exe gcm_ossl_perf.exe xts_128_dec_ossl_perf.exe xts_128_dec_perf.exe xts_128_enc_ossl_perf.exe xts_128_enc_perf.exe xts_256_dec_ossl_perf.exe xts_256_dec_perf.exe xts_256_enc_ossl_perf.exe xts_256_enc_perf.exe md5_mb_vs_ossl_perf.exe mh_sha1_perf.exe mh_sha1_murmur3_x64_128_perf.exe
+
+perfs: lib $(perfs)
+$(perfs): $(@B).obj
+
+clean:
+ -if exist *.obj del *.obj
+ -if exist bin\*.obj del bin\*.obj
+ -if exist isa-l_crypto_static.lib del isa-l_crypto_static.lib
+ -if exist *.exe del *.exe
+ -if exist isa-l_crypto.lib del isa-l_crypto.lib
+ -if exist isa-l_crypto.dll del isa-l_crypto.dll
+
+libeay32.lib:
+sha1_mb_rand_test.exe: sha1_ref.obj
+sha1_mb_rand_update_test.exe: sha1_ref.obj
+sha1_mb_rand_ssl_test.exe: libeay32.lib
+sha1_mb_vs_ossl_perf.exe: libeay32.lib
+sha256_mb_rand_ssl_test.exe: sha256_ref.obj
+sha256_mb_rand_test.exe: sha256_ref.obj
+sha256_mb_rand_update_test.exe: sha256_ref.obj
+sha256_mb_rand_ssl_test.exe: libeay32.lib
+sha256_mb_vs_ossl_perf.exe: libeay32.lib
+sha512_mb_rand_test.exe: sha512_ref.obj
+sha512_mb_rand_update_test.exe: sha512_ref.obj
+sha512_mb_rand_ssl_test.exe: libeay32.lib
+sha512_mb_vs_ossl_perf.exe: libeay32.lib
+md5_mb_rand_test.exe: md5_ref.obj
+md5_mb_rand_update_test.exe: md5_ref.obj
+md5_mb_rand_ssl_test.exe: libeay32.lib
+md5_mb_vs_ossl_perf.exe: libeay32.lib
+mh_sha1_test.exe: mh_sha1_ref.obj
+mh_sha1_update_test.exe: mh_sha1_ref.obj
+mh_sha1_murmur3_x64_128_test.exe: mh_sha1_ref.obj murmur3_x64_128.obj
+mh_sha1_murmur3_x64_128_update_test.exe: mh_sha1_ref.obj murmur3_x64_128.obj
+mh_sha1_murmur3_x64_128_perf.exe: mh_sha1_ref.obj murmur3_x64_128.obj
+cbc_ossl_perf.exe: libeay32.lib
+cbc_std_vectors_random_test.exe: libeay32.lib
+gcm_ossl_perf.exe: libeay32.lib
+gcm_std_vectors_random_test.exe: libeay32.lib
+xts_128_enc_ossl_perf.exe: libeay32.lib
+xts_128_dec_ossl_perf.exe: libeay32.lib
+xts_128_rand_ossl_test.exe: libeay32.lib
+xts_256_enc_ossl_perf.exe: libeay32.lib
+xts_256_dec_ossl_perf.exe: libeay32.lib
+xts_256_rand_ossl_test.exe: libeay32.lib
diff --git a/src/crypto/isa-l/isa-l_crypto/Makefile.unx b/src/crypto/isa-l/isa-l_crypto/Makefile.unx
new file mode 100644
index 000000000..f1934e8be
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/Makefile.unx
@@ -0,0 +1,41 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+units = sha1_mb sha256_mb sha512_mb md5_mb mh_sha1 mh_sha1_murmur3_x64_128 aes
+
+default: lib
+
+include $(foreach unit,$(units), $(unit)/Makefile.am)
+
+# Override individual lib names to make one inclusive library.
+lib_name := bin/isa-l_crypto.a
+
+include make.inc
+
+VPATH = . $(units) include
diff --git a/src/crypto/isa-l/isa-l_crypto/README.md b/src/crypto/isa-l/isa-l_crypto/README.md
new file mode 100644
index 000000000..5703a3111
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/README.md
@@ -0,0 +1,53 @@
+================================================================
+Intel(R) Intelligent Storage Acceleration Library Crypto Version
+================================================================
+
+ISA-L_crypto is a collection of optimized low-level functions targeting storage
+applications. ISA-L_crypto includes:
+
+* Multi-buffer hashes - run multiple hash jobs together on one core for much
+ better throughput than single-buffer versions.
+ - SHA1, SHA256, SHA512, MD5
+
+* Multi-hash - Get the performance of multi-buffer hashing with a single-buffer
+ interface.
+
+* Multi-hash + murmur - run both together.
+
+* AES - block ciphers
+ - XTS, GCM, CBC
+
+See [ISA-L_crypto for updates.](https://github.com/01org/isa-l_crypto)
+For non-crypto ISA-L see [isa-l on github.](https://github.com/01org/isa-l)
+
+Build Prerequisites
+===================
+
+ISA-L requires yasm version 1.2.0 or later or nasm v2.11.01 or later. Building
+with autotools requires autoconf/automake packages.
+
+Building ISA-L
+==============
+
+Autotools
+---------
+
+To build and install the library with autotools it is usually sufficient to run
+the following:
+
+ ./autogen.sh
+ ./configure
+ make
+ sudo make install
+
+Other targets include: make check, make tests, make perfs, make ex (examples)
+and make other.
+
+Windows
+-------
+
+On Windows use nmake to build dll and static lib:
+
+ nmake -f Makefile.nmake
+
+Other targets include: nmake check.
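
The multi-buffer hashes the README describes are driven through per-algorithm context managers; the patch also carries a full example in sha1_mb/sha1_multi_buffer_example.c. The condensed sketch below follows the sha1_mb.h API added by this patch; treat the exact names, types, and alignment requirement as per that header:

    // Hedged sketch of the SHA1 multi-buffer CTX API from sha1_mb.h.
    #include <stdlib.h>
    #include "sha1_mb.h"

    int hash_one_buffer(void *buf, uint32_t len, uint32_t digest[SHA1_DIGEST_NWORDS])
    {
        SHA1_HASH_CTX_MGR *mgr = NULL;
        SHA1_HASH_CTX ctx;

        // The context manager structure wants 16-byte alignment.
        if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
            return -1;
        sha1_ctx_mgr_init(mgr);

        hash_ctx_init(&ctx);
        sha1_ctx_mgr_submit(mgr, &ctx, buf, len, HASH_ENTIRE);  // whole buffer as one job
        while (sha1_ctx_mgr_flush(mgr) != NULL)
            ;                                                   // drain any jobs still in flight

        for (int i = 0; i < SHA1_DIGEST_NWORDS; i++)
            digest[i] = ctx.job.result_digest[i];
        free(mgr);
        return 0;
    }

With several buffers, submitting many contexts before flushing is what lets the manager fill all of the SIMD lanes; the one-job case above still works, it just gains nothing over a single-buffer hash.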
diff --git a/src/crypto/isa-l/isa-l_crypto/Release_notes.txt b/src/crypto/isa-l/isa-l_crypto/Release_notes.txt
new file mode 100644
index 000000000..836d60ecb
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/Release_notes.txt
@@ -0,0 +1,139 @@
+================================================================================
+v2.16 Intel Intelligent Storage Acceleration Library Crypto Release Notes
+================================================================================
+
+================================================================================
+RELEASE NOTE CONTENTS
+================================================================================
+1. KNOWN ISSUES
+2. FIXED ISSUES
+3. CHANGE LOG & FEATURES ADDED
+
+================================================================================
+1. KNOWN ISSUES
+================================================================================
+
+* Perf tests do not run in Windows environment.
+
+* 32-bit lib is not supported in Windows.
+
+================================================================================
+2. FIXED ISSUES
+================================================================================
+v2.14
+
+* Building in unit directories is no longer supported removing the issue of
+ leftover object files causing the top-level make build to fail.
+
+v2.9
+
+* Multi-buffer MD5 AVX2 tests fixed to work on FreeBSD 9.1 by explicitly aligning
+ structures.
+
+v2.7
+
+* Unit tests and examples are now supported in Windows environment
+
+
+================================================================================
+3. CHANGE LOG & FEATURES ADDED
+================================================================================
+v2.16
+
+* Split lib from non-crypto functions.
+
+
+v2.15
+
+* Multi-buffer hash updates. New AVX512 versions for multi-buffer SHA1, SHA256,
+ SHA512, MD5 and SHA1+murmur3_x64_128 stitched.
+
+* Removes restrictions on AAD length in AES-GCM. Previously AAD length was
+ limited to a multiple of 4 bytes. Now any AAD length is allowed.
+
+* Nasm support. ISA-L ported to build with nasm or yasm assembler.
+
+* Windows DLL support. Windows builds DLL by default.
+
+* The older, deprecated multi-buffer API has been removed.
+
+v2.14
+
+* New multi-hash sha1 function and multi-hash sha1 + murmur3_x64_128 stitched.
+ Multi-hash is designed to give the performance of multi-buffer cryptographic
+ hashes with a synchronous single buffer interface.
+
+* New AES-GCM and AES-CBC functions added.
+
+* Autoconf and autotools build allows easier porting to additional systems.
+ Previous make system still available to embedded users with Makefile.unx.
+
+* The AES key expand functions that were used for AES-XTS with pre-expanded keys
+ now expand the decrypt keys in a different order. The order that decrypt keys
+ are stored and used by XTS_AES_128_dec_expanded_key() is reversed from
+  previous versions to be compatible with CBC and GCM key expansion. The
+ aes_keyexp_*() and XTS_AES_128_dec_expanded_key() functions should work the
+ same when paired together.
+
+* Includes update for building on Mac OS X/darwin systems. Add --target=darwin
+ to ./configure step.
+
+v2.10
+
+* Added multi-buffer MD5 in the new hash API. Includes multi-binary capability,
+ no restriction on update length and other benefits of the CTX API.
+
+v2.9
+
+* New multi-buffer hash API. The new API brings the following new features to
+ multi-buffer hashes. The older API is still included but may be deprecated in
+ future releases.
+
+ - Multibinary functionality. Call one function and the appropriate
+ architecture-specific version is fixed up at runtime.
+
+ - No restriction on update length. Submitting an update block no longer has
+ to have length a multiple of the fundamental block size.
+
+* New expanded key tests added for AES-XTS 128 and 256
+
+v2.7
+
+* New AVX2 versions for mb_md5 and mb_sha512 hashing code have been added.
+
+v2.6
+
+* Update buffer functionality added to mb_md5, mb_sha256 and mb_sha512 hashing
+ code. Requires API changes to current interface to specify job type and total
+ length of hash.
+
+* New AVX2 versions for mb_sha1 and mb_sha256 hashing code have been added.
+
+v2.5
+
+* New feature for multi-buffer SHA-1, update buffer. mb_sha1 non-finalize jobs
+ can now be submitted by setting flags in job structure. Requires API changes
+ to current interface to specify job type and total length of hash.
+
+v2.4
+
+* Added new multi-buffer SHA-512: mb_sha512. SSE, AVX versions.
+
+v2.3
+
+* Added improved AES XTS versions.
+
+v2.2
+
+* Added new AVX versions of multi-buffer hashes
+* Changed type in the interface struct for multi-buffer hashes
+ the len field in the following structures :JOB_SHA1,JOB_MD5,
+ JOB_SHA256 is now a 32-bit int.
+
+v2.0
+
+* Added AES XTS units aes_xts_128, aes_xts_256
+
+v1.3
+
+* Added new multi-buffer units for SHA-256 and MD5: mb_sha256, mb_md5.
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am b/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am
new file mode 100644
index 000000000..9ae1f0cb4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/Makefile.am
@@ -0,0 +1,103 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+extern_hdrs += include/aes_gcm.h include/aes_cbc.h include/aes_xts.h include/aes_keyexp.h
+
+lsrc += aes/gcm_multibinary.asm aes/gcm_pre.c
+lsrc += aes/gcm128_avx_gen2.asm aes/gcm128_avx_gen4.asm aes/gcm128_sse.asm
+lsrc += aes/gcm256_avx_gen2.asm aes/gcm256_avx_gen4.asm aes/gcm256_sse.asm
+lsrc += aes/keyexp_multibinary.asm
+lsrc += aes/keyexp_128.asm aes/keyexp_192.asm aes/keyexp_256.asm
+lsrc += aes/cbc_multibinary.asm
+lsrc += aes/cbc_dec_128_x4_sse.asm aes/cbc_dec_128_x8_avx.asm
+lsrc += aes/cbc_dec_192_x4_sse.asm aes/cbc_dec_192_x8_avx.asm
+lsrc += aes/cbc_dec_256_x4_sse.asm aes/cbc_dec_256_x8_avx.asm
+lsrc += aes/cbc_enc_128_x4_sb.asm aes/cbc_enc_128_x8_sb.asm
+lsrc += aes/cbc_enc_192_x4_sb.asm aes/cbc_enc_192_x8_sb.asm
+lsrc += aes/cbc_enc_256_x4_sb.asm aes/cbc_enc_256_x8_sb.asm
+lsrc += aes/cbc_pre.c
+lsrc += aes/xts_aes_128_multibinary.asm
+lsrc += aes/XTS_AES_128_dec_sse.asm aes/XTS_AES_128_dec_expanded_key_sse.asm
+lsrc += aes/XTS_AES_128_enc_sse.asm aes/XTS_AES_128_enc_expanded_key_sse.asm
+lsrc += aes/XTS_AES_128_dec_avx.asm aes/XTS_AES_128_dec_expanded_key_avx.asm
+lsrc += aes/XTS_AES_128_enc_avx.asm aes/XTS_AES_128_enc_expanded_key_avx.asm
+lsrc += aes/xts_aes_256_multibinary.asm
+lsrc += aes/XTS_AES_256_dec_avx.asm aes/XTS_AES_256_dec_expanded_key_avx.asm
+lsrc += aes/XTS_AES_256_enc_avx.asm aes/XTS_AES_256_enc_expanded_key_avx.asm
+lsrc += aes/XTS_AES_256_dec_sse.asm aes/XTS_AES_256_dec_expanded_key_sse.asm
+lsrc += aes/XTS_AES_256_enc_sse.asm aes/XTS_AES_256_enc_expanded_key_sse.asm
+
+other_src += include/multibinary.asm
+other_src += include/test.h include/types.h include/reg_sizes.asm
+other_src += aes/gcm_defines.asm
+other_src += aes/cbc_common.asm aes/cbc_std_vectors.h
+other_src += aes/gcm_vectors.h aes/ossl_helper.h
+other_src += aes/xts_128_vect.h
+other_src += aes/xts_256_vect.h
+
+check_tests += aes/cbc_std_vectors_test
+check_tests += aes/gcm_std_vectors_test
+check_tests += aes/xts_128_test
+check_tests += aes/xts_256_test
+check_tests += aes/xts_128_expanded_key_test
+check_tests += aes/xts_256_expanded_key_test
+
+unit_tests += aes/cbc_std_vectors_random_test
+unit_tests += aes/gcm_std_vectors_random_test
+unit_tests += aes/xts_128_rand aes/xts_128_rand_ossl_test
+unit_tests += aes/xts_256_rand aes/xts_256_rand_ossl_test
+
+perf_tests += aes/cbc_ossl_perf
+perf_tests += aes/gcm_ossl_perf
+perf_tests += aes/xts_128_enc_ossl_perf
+perf_tests += aes/xts_256_enc_ossl_perf
+perf_tests += aes/xts_128_enc_perf aes/xts_128_dec_perf aes/xts_128_dec_ossl_perf
+perf_tests += aes/xts_256_enc_perf aes/xts_256_dec_perf aes/xts_256_dec_ossl_perf
+
+
+cbc_ossl_perf: LDLIBS += -lcrypto
+aes_cbc_ossl_perf_LDFLAGS = -lcrypto
+cbc_std_vectors_random_test: LDLIBS += -lcrypto
+aes_cbc_std_vectors_random_test_LDFLAGS = -lcrypto
+gcm_ossl_perf: LDLIBS += -lcrypto
+aes_gcm_ossl_perf_LDFLAGS = -lcrypto
+gcm_std_vectors_random_test: LDLIBS += -lcrypto
+aes_gcm_std_vectors_random_test_LDFLAGS = -lcrypto
+xts_128_enc_ossl_perf: LDLIBS += -lcrypto
+aes_xts_128_enc_ossl_perf_LDFLAGS = -lcrypto
+xts_128_dec_ossl_perf: LDLIBS += -lcrypto
+aes_xts_128_dec_ossl_perf_LDFLAGS = -lcrypto
+xts_128_rand_ossl_test: LDLIBS += -lcrypto
+aes_xts_128_rand_ossl_test_LDFLAGS = -lcrypto
+xts_256_enc_ossl_perf : LDLIBS += -lcrypto
+aes_xts_256_enc_ossl_perf_LDFLAGS = -lcrypto
+xts_256_dec_ossl_perf : LDLIBS += -lcrypto
+aes_xts_256_dec_ossl_perf_LDFLAGS = -lcrypto
+xts_256_rand_ossl_test: LDLIBS += -lcrypto
+aes_xts_256_rand_ossl_test_LDFLAGS = -lcrypto
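
Besides CBC and GCM, the AES unit above builds the XTS routines; the assembly file that follows documents the underlying prototype (tweak key k2, block key k1, initial tweak, sector length in bytes, input, output). A hedged sketch of a single-sector round-trip through the multibinary entry points declared in aes_xts.h; argument order is taken from that prototype comment, so adjust to the header if it differs:

    // Hedged sketch: encrypt and decrypt one 512-byte sector with AES-128-XTS.
    #include <stdint.h>
    #include "aes_xts.h"

    void xts128_sector_roundtrip(uint8_t key2[16], uint8_t key1[16], uint8_t tweak[16],
                                 uint8_t pt[512], uint8_t ct[512], uint8_t out[512])
    {
        // The multibinary dispatchers pick the SSE or AVX implementation at runtime.
        XTS_AES_128_enc(key2, key1, tweak, 512, pt, ct);
        XTS_AES_128_dec(key2, key1, tweak, 512, ct, out);
        // out should now match pt.
    }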
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm
new file mode 100644
index 000000000..ebd1646af
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_avx.asm
@@ -0,0 +1,1777 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; the input keys need not be aligned
+; the key schedules are expanded in parallel with the tweak encryption
+; the plaintext and ciphertext buffers need not be aligned
+; the second key's expanded schedule is stored on the stack, 16-byte aligned
+; the first key is needed only once, so its schedule is not kept
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
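+; GHASH_POLY is the low byte of the XTS reduction polynomial
+; x^128 + x^7 + x^2 + x + 1.  Each shl/adc/cmovc/xor sequence on
+; twtempl:twtemph below multiplies the current 128-bit tweak by x in
+; GF(2^128); roughly, in C terms (twtempl = low qword, twtemph = high qword):
+;     carry = tw_hi >> 63;                        /* bit 127 shifted out */
+;     tw_hi = (tw_hi << 1) | (tw_lo >> 63);
+;     tw_lo = (tw_lo << 1) ^ (carry ? 0x87 : 0);  /* reduce by the polynomial */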
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_avx(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
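+
+; note: despite their names, ptr_plaintext is bound to the ct argument (the
+; sector being read) and ptr_ciphertext to the pt argument (the sector being
+; written) in this decrypt routine, so the "load plaintext" / "store
+; ciphertext" comments below mean reading ciphertext and writing plaintext.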
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+; produce the key for the next round
+; raw_key is the output of the vaeskeygenassist instruction
+; on entry, round_key holds the current round key
+; on exit, round_key holds the next round key
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
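+
+; the macro above performs one step of the standard FIPS-197 AES-128 key
+; schedule: with w[0..3] the previous round key and "assist" the broadcast
+; vaeskeygenassist dword (SubWord/RotWord/Rcon already applied), it computes,
+; roughly in C terms:
+;     w[4] = w[0] ^ assist;
+;     w[5] = w[1] ^ w[4];
+;     w[6] = w[2] ^ w[5];
+;     w[7] = w[3] ^ w[6];
+; the two shufps/vpxor pairs propagate that xor chain across the four dwords
+; without leaving the xmm registers.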
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 9
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%xtmp2 %6
+%define %%ptr_key2 %7
+%define %%ptr_key1 %8
+%define %%ptr_expanded_keys %9
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*9], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*8], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*7], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*6], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*5], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*4], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*3], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*2], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys + 16*1], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*0], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
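+
+; net effect of encrypt_T: the initial tweak is encrypted with key2
+; (TW = AES-128-Enc(k2, TW_initial)) while key1's schedule is laid out in the
+; "equivalent inverse cipher" form used by vaesdec below:
+;     keys[0]    = encryption round-10 key        (used for the initial ARK)
+;     keys[1..9] = vaesimc of encryption round keys 9..1
+;     keys[10]   = encryption round-0 key         (used by vaesdeclast)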
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values are generated
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; decrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro decrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
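+
+; in both decrypt_initial and decrypt_by_eight the scalar tweak updates
+; (the shl/adc/cmovc/xor sequences on twtempl:twtemph) are stitched between
+; the AES round instructions so the general-purpose tweak arithmetic overlaps
+; the vaesdec work; when the lt128/last_eight flag is set, the updates are
+; skipped because no tweaks beyond the current batch are needed.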
+
+
+section .text
+
+global XTS_AES_128_dec_avx:function
+XTS_AES_128_dec_avx:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7]
+ vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
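+	; ciphertext stealing on decrypt (IEEE P1619): the last full ciphertext
+	; block was encrypted under the final tweak, so it has already been
+	; decrypted above with the swapped-in tweak, leaving PP in xmm8; roughly:
+	;     PP     = XTS_dec_block(C[m-1], T[m])     ; done before this label
+	;     P[m]   = PP[0 .. N_val-1]
+	;     P[m-1] = XTS_dec_block(C[m] | PP[N_val .. 15], T[m-1])
+	; the code below assembles the stolen block and decrypts it with T[m-1],
+	; which every caller saved at [TW + 16*0] before the final decryption.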
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;decrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
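+; indexing: the 16 bytes at [vpshufb_shf_table + N] form a vpshufb mask that
+; shifts a register left by (16 - N) bytes, while the bytes at
+; [vpshufb_shf_table + 16 - N], xor'd with mask1, give the matching
+; right-shift-by-(16 - N) mask whose sign bits also serve as the vpblendvb
+; selector in _steal_cipher.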
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm
new file mode 100644
index 000000000..151113e70
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_avx.asm
@@ -0,0 +1,1747 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; the expanded input keys need not be aligned
+; the plaintext and ciphertext buffers need not be aligned
+; the second key's schedule is stored on the stack, 16-byte aligned
+; the first key is needed only once, so its schedule is not kept
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*0]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
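+
+; unlike the on-the-fly key-expansion variant, this routine takes
+; caller-supplied 11-round key schedules (16*11 bytes each): k2's schedule is
+; used as-is to encrypt the initial tweak, and k1's schedule is merely copied
+; onto the stack, so k1 must already be the decryption-side (inverse cipher)
+; schedule that vaesdec expects, e.g. as produced by the library's key
+; expansion helpers.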
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values are generated
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Decrypt 8 blocks in parallel
+; generate next 8 tweak values
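+; all eight block states go through every round back to back to keep the
+; AES unit saturated; unless this is the last group (%%last_eight != 0),
+; the next eight tweaks are computed in the scalar slots between rounds,
+; with the eighth tweak written back only after the last round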
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
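+; Overall flow of the function below:
+;   1. encrypt the initial tweak with key2 and copy key1's round keys to
+;      the aligned stack buffer (encrypt_T)
+;   2. process the leading (N/16) mod 8 whole blocks (encrypt_initial)
+;   3. loop over 8 blocks at a time until only 128 bytes remain
+;   4. handle the final 8 blocks without stitched tweak generation and,
+;      if N is not a multiple of 16, finish with ciphertext stealing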
+section .text
+
+global XTS_AES_128_dec_expanded_key_avx:function
+XTS_AES_128_dec_expanded_key_avx:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks (128 bytes) will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7]
+ vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
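+	; the two moves above implement the tweak swap needed for ciphertext
+	; stealing on decrypt: the last full block must be processed with the
+	; tweak that follows it, so the old 8th tweak is parked in slot 0 (read
+	; back as [TW] in _steal_cipher) and the newly generated tweak takes
+	; its place in slot 7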
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
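+	; the last full block was already decrypted with the following tweak;
+	; its leading N_val bytes become the final partial output, while its
+	; remaining bytes are combined with the trailing partial ciphertext
+	; (via the vpshufb shifts and vpblendvb below) and the merged block is
+	; decrypted with the previous tweak saved in [TW]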
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+	vpblendvb	xmm3, xmm3, xmm2, xmm0	; merge using the select mask in xmm0
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+	;decrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+ vaesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
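+	; sectors shorter than one 16-byte block are not supported: the check
+	; above returns (via _ret_) without writing any output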
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
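+; mask1 sets the top bit of every byte; XORed into the shuffle control it
+; flips the lanes that must be discarded to 0x80-style indices (zeroed by
+; vpshufb) and at the same time provides the select mask for vpblendvb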
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm
new file mode 100644
index 000000000..5ac14416b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_expanded_key_sse.asm
@@ -0,0 +1,1746 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; the second key (k1) is stored on the stack, aligned to 16 bytes
+; the first key (k2) is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
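+; 0x87 is the low byte of the XTS reduction polynomial
+; x^128 + x^7 + x^2 + x + 1; it is XORed into the low quadword of a tweak
+; whenever the left shift carries out of bit 127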
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
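+; the tweak block is run through key2's schedule with aesenc while, between
+; rounds, key1's eleven round keys are copied from (possibly unaligned)
+; user memory into the 16-byte-aligned stack buffer so the bulk loop can
+; load them with movdqa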
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*0]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; decrypt initial blocks of AES (macro name kept as encrypt_initial to match the encrypt variant)
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Decrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
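+
+; Note on the macro above: the scalar shl/adc/cmovc/xor groups placed between the
+; aesdec rounds compute the next eight tweak values (tweak * x in GF(2^128)) while
+; the AES units are busy, hiding the scalar latency. When %%last_eight is nonzero
+; no further tweaks are needed, so those groups assemble to nothing.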
+
+
+section .text
+
+global XTS_AES_128_dec_expanded_key_sse:function
+XTS_AES_128_dec_expanded_key_sse:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
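+	; The last full block was decrypted above with the following (swapped-in) tweak,
+	; so its leading N_val bytes are already the final partial plaintext block. The
+	; bytes it no longer needs are "stolen": the pshufb_shf_table/mask1 masks and
+	; pblendvb splice them onto the trailing N_val input bytes, and the rebuilt block
+	; is decrypted with the preceding tweak saved at [TW].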
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+	pxor	xmm8, xmm3	; keep the state in xmm8: swapping the pxor destination saves a move from xmm3 to xmm8
+
+
+	;decrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+ aesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm15
+ movdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm14
+ movdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm13
+ movdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm12
+ movdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm11
+ movdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm10
+ movdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm9
+ movdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a	; shl 14 (16-2)	/ shr2
+;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b	; shl 13 (16-3)	/ shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm
new file mode 100644
index 000000000..1fe565594
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_dec_sse.asm
@@ -0,0 +1,1778 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is expanded and stored on the stack, aligned to 16 bytes
+; the first key is required only once and is not stored
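+; Processing outline: data is handled eight blocks (128 bytes) at a time; the next
+; eight tweak values are computed in scalar registers (twtempl/twtemph), stitched
+; between the AES rounds, and any trailing partial block is handled with ciphertext
+; stealing.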
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
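+; 0x87 is the low byte of the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1.
+; Throughout this file the tweak is multiplied by x (alpha) with the scalar pattern:
+;	shl	twtempl, 1				; low quadword <<= 1, CF = old bit 63
+;	adc	twtemph, twtemph			; high quadword <<= 1 plus carry, CF = old bit 127
+;	cmovc	ghash_poly_8b_temp, ghash_poly_8b	; pick up 0x87 only if bit 127 was shifted out
+;	xor	twtempl, ghash_poly_8b_temp		; conditional reduction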
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_dec_sse(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
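+; Illustrative call from C (identifier names below are placeholders, not part of the
+; library headers):
+;	XTS_AES_128_dec_sse(key2, key1, initial_tweak, sector_len, ct_buf, pt_buf);
+; sector_len (N) must be at least one 16-byte block; any trailing partial block is
+; handled with ciphertext stealing.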
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
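+
+; Note: the pointer names are carried over from the encryption variant; in this
+; decrypt routine ptr_plaintext addresses the ciphertext input (ct) and
+; ptr_ciphertext the plaintext output (pt), per the prototype above.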
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+; produce the key for the next round
+; raw_key is the output of aeskeygenassist instruction
+; round_key value before this key_expansion_128 macro is current round key
+; round_key value after this key_expansion_128 macro is next round key
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
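+
+; Note on the sequence above: aeskeygenassist leaves SubWord(RotWord(w3)) ^ Rcon in
+; the dword that pshufd broadcasts to all four lanes; the two shufps/pxor pairs fold
+; the previous round key into its own running XOR, so the final pxor with the
+; broadcast value yields the next round key. %%xtmp is assumed to be zero on entry,
+; which is why callers clear it (pxor xmm4, xmm4) before invoking encrypt_T.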
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 9
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%xtmp2 %6
+%define %%ptr_key2 %7
+%define %%ptr_key1 %8
+%define %%ptr_expanded_keys %9
+
+
+ movdqu %%xkey2, [%%ptr_key2]
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*9], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*8], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*7], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*6], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*5], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*4], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*3], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*2], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys + 16*1], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*0], %%xkey1
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
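+
+; Note: encrypt_T encrypts the initial tweak with key2 while expanding key1 into a
+; decryption key schedule: [keys + 16*0] receives the last (untransformed) round key
+; used for the initial AddRoundKey, [keys + 16*1..16*9] receive the aesimc-transformed
+; middle round keys in reverse order, and [keys + 16*10] keeps the original key used
+; by aesdeclast. The encrypted tweak is written to [TW].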
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
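+
+; Note: initialize loads up to seven input blocks and derives the corresponding
+; tweaks from the encrypted initial tweak at [TW + 16*0] by repeated doubling in
+; GF(2^128); tweak i is kept both in its xmm register and at [TW + 16*(i-1)] so the
+; later macros can reload it.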
+
+
+; decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values are generated
+%macro decrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks decrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
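+
+; Note: decrypt_initial handles the 1..7-block prologue; %%num_blocks is an
+; assemble-time constant, so the unused lanes above assemble to nothing. When
+; %%lt128 is zero it also pre-computes the next eight tweak values into [TW]
+; between the AES rounds.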
+
+
+; decrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro decrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_128_dec_sse:function
+XTS_AES_128_dec_sse:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ decrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+	pxor	xmm8, xmm3	; result is left in xmm8: the pxor destination is swapped instead of moving xmm3 to xmm8
+
+
+ ;decrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+ aesdeclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm15
+ movdqa xmm15, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm14
+ movdqa xmm14, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm13
+ movdqa xmm13, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm12
+ movdqa xmm12, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm11
+ movdqa xmm11, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm10
+ movdqa xmm10, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm9
+ movdqa xmm9, [TW+16*1]
+
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ decrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a		; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b		; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm
new file mode 100644
index 000000000..572c6195c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_avx.asm
@@ -0,0 +1,1530 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, 16-byte aligned
+; the first key is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys	rsp + 16*8	; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
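
GHASH_POLY is the reduction constant used to advance the XTS tweak, i.e. to multiply the 128-bit tweak by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1. The recurring shl/adc/cmovc/xor sequences in this file are the two-64-bit-halves form of that step; a minimal C sketch of one update:

```c
#include <stdint.h>

/* One tweak update, as performed by the shl/adc/cmovc/xor sequences:
 * tw[0] is the low 64 bits, tw[1] the high 64 bits of the tweak. */
static void xts_next_tweak(uint64_t tw[2])
{
    uint64_t carry = tw[1] >> 63;              /* bit shifted out of the top half */
    tw[1] = (tw[1] << 1) | (tw[0] >> 63);      /* 128-bit left shift by one       */
    tw[0] = (tw[0] << 1) ^ (carry ? 0x87 : 0); /* reduce with the GHASH_POLY byte */
}
```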
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_avx(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
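
In C terms the entry point corresponds to the declaration below; the call is a hypothetical illustration (the 512-byte sector size and buffer names are not taken from this file):

```c
#include <stdint.h>

void XTS_AES_128_enc_avx(uint8_t *k2, uint8_t *k1, uint8_t *TW_initial,
                         uint64_t N, const uint8_t *pt, uint8_t *ct);

/* Encrypt one 512-byte sector; iv is the 16-byte initial tweak
 * (e.g. the little-endian sector number). */
static void encrypt_sector(uint8_t k2[16], uint8_t k1[16], uint8_t iv[16],
                           const uint8_t pt[512], uint8_t ct[512])
{
    XTS_AES_128_enc_avx(k2, k1, iv, 512, pt, ct);
}
```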
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of the vaeskeygenassist instruction
+; on entry, round_key holds the current round key; on exit, it holds the next round key
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
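
The shufps/vpxor pairs above compute the running xor of the four key words against a zeroed helper register. The more common intrinsics rendering of the same AES-128 expansion step, shown here only as a reference sketch, uses byte shifts instead:

```c
#include <wmmintrin.h>

/* Next round key from the current one plus the _mm_aeskeygenassist_si128
 * output on it (equivalent in effect to key_expansion_128 above). */
static __m128i aes128_expand_step(__m128i key, __m128i assist)
{
    assist = _mm_shuffle_epi32(assist, 0xff);  /* broadcast RotWord(SubWord(w3)) ^ rcon */
    __m128i t = _mm_slli_si128(key, 4);
    key = _mm_xor_si128(key, t);               /* w0, w0^w1, w1^w2, w2^w3               */
    t = _mm_slli_si128(t, 4);
    key = _mm_xor_si128(key, t);
    t = _mm_slli_si128(t, 4);
    key = _mm_xor_si128(key, t);               /* prefix xor of the four key words      */
    return _mm_xor_si128(key, assist);
}
```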
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*3], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*4], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*5], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*6], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*7], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*8], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*9], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys + 16*10], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
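
encrypt_T stitches the expansion of both key schedules into the ten rounds applied to the initial tweak; ignoring that interleaving, the value it leaves at [TW] is simply the AES-128 encryption of the initial tweak under k2. A sketch assuming the k2 schedule (rk2, illustrative) were already expanded:

```c
#include <wmmintrin.h>

static __m128i encrypt_tweak(const __m128i rk2[11], __m128i tweak)
{
    tweak = _mm_xor_si128(tweak, rk2[0]);        /* ARK         */
    for (int r = 1; r < 10; r++)
        tweak = _mm_aesenc_si128(tweak, rk2[r]); /* rounds 1..9 */
    return _mm_aesenclast_si128(tweak, rk2[10]); /* round 10    */
}
```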
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
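
Per block, encrypt_initial above and the eight-wide loop below both implement the standard XTS transform C = AES-128-Enc_k1(P xor T) xor T. A single-block intrinsics sketch (rk1 stands for the schedule kept at [keys] and is an assumption of the sketch):

```c
#include <wmmintrin.h>

static __m128i xts_block(const __m128i rk1[11], __m128i pt, __m128i tw)
{
    __m128i s = _mm_xor_si128(pt, tw);    /* whiten with the tweak */
    s = _mm_xor_si128(s, rk1[0]);         /* ARK                   */
    for (int r = 1; r < 10; r++)
        s = _mm_aesenc_si128(s, rk1[r]);  /* rounds 1..9           */
    s = _mm_aesenclast_si128(s, rk1[10]); /* round 10              */
    return _mm_xor_si128(s, tw);          /* unwhiten              */
}
```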
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
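
The scalar shl/adc/cmovc/xor fragments spread between the AES rounds above hide the latency of generating the next eight tweaks. Collected in one place, they amount to eight successive GF(2^128) doublings of the running tweak, written back to the TW scratch area (sketch):

```c
#include <stdint.h>

static void next_eight_tweaks(const uint64_t last[2], uint64_t tw[8][2])
{
    uint64_t lo = last[0], hi = last[1];
    for (int i = 0; i < 8; i++) {
        uint64_t carry = hi >> 63;
        hi = (hi << 1) | (lo >> 63);
        lo = (lo << 1) ^ (carry ? 0x87 : 0); /* doubling, reduced by 0x87 */
        tw[i][0] = lo;                       /* low half of tweak i       */
        tw[i][1] = hi;                       /* high half of tweak i      */
    }
}
```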
+
+
+section .text
+
+global XTS_AES_128_enc_avx:function
+XTS_AES_128_enc_avx:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+	vpblendvb	xmm3, xmm3, xmm2, xmm0	; blend mask is supplied explicitly in xmm0
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+	vpxor	xmm8, xmm3	; result is left in xmm8: the vpxor destination is swapped instead of moving xmm3 to xmm8
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a		; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b		; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
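
vpshufb_shf_table and mask1 drive the byte rotation and blend used by _steal_cipher. As a reference, a hedged C sketch of the stealing step for encryption (r is N mod 16 with 0 < r < 16; pt and ct point at the final 16 + r bytes; xts_block is the single-block helper sketched earlier and is assumed here):

```c
#include <stdint.h>
#include <string.h>
#include <wmmintrin.h>

__m128i xts_block(const __m128i rk1[11], __m128i pt, __m128i tw);

static void xts_steal_encrypt(const __m128i rk1[11], const uint8_t *pt,
                              uint8_t *ct, unsigned r,
                              __m128i tw_last, __m128i tw_next)
{
    uint8_t buf[16];

    /* Encrypt the last full plaintext block; its first r bytes become the
     * short final ciphertext block. */
    __m128i cc = xts_block(rk1, _mm_loadu_si128((const __m128i *)pt), tw_last);
    _mm_storeu_si128((__m128i *)buf, cc);
    memcpy(ct + 16, buf, r);

    /* Steal the remaining 16 - r bytes to pad the short plaintext block,
     * then encrypt the padded block as the last full ciphertext block. */
    memcpy(buf, pt + 16, r);
    cc = xts_block(rk1, _mm_loadu_si128((const __m128i *)buf), tw_next);
    _mm_storeu_si128((__m128i *)ct, cc);
}
```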
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm
new file mode 100644
index 000000000..108123334
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_avx.asm
@@ -0,0 +1,1505 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; expanded keys are not assumed to be aligned
+; plaintext and ciphertext are not assumed to be aligned
+; the data key schedule (k1) is copied to the stack, aligned to 16 Bytes
+; the tweak key schedule (k2) is used only once, so it is not stored
+
+%include "reg_sizes.asm"
+default rel
+
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
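A hedged usage sketch (not part of the patch) showing how a caller might invoke the routine declared in the comment block above; the two key schedules are assumed to have been expanded by the caller into 11 round keys of 16 bytes each, and the wrapper name is hypothetical.

    #include <stdint.h>

    void XTS_AES_128_enc_expanded_key_avx(uint8_t *k2, uint8_t *k1,
                                          uint8_t *TW_initial, uint64_t N,
                                          const uint8_t *pt, uint8_t *ct);

    /* hypothetical wrapper: encrypt one sector of sector_len bytes (>= 16) */
    static void encrypt_sector(uint8_t expanded_k2[16 * 11],
                               uint8_t expanded_k1[16 * 11],
                               uint8_t initial_tweak[16],
                               const uint8_t *sector, uint8_t *out,
                               uint64_t sector_len)
    {
        XTS_AES_128_enc_expanded_key_avx(expanded_k2, expanded_k1,
                                         initial_tweak, sector_len, sector, out);
    }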
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
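The macro above encrypts the initial tweak with the k2 schedule while copying the k1 schedule into the 16-Byte-aligned scratch area on the stack. As a rough equivalent (an illustrative sketch with AES-NI intrinsics, not part of the patch; the function name is hypothetical):

    #include <immintrin.h>
    #include <stdint.h>

    static __m128i encrypt_tweak_and_stage_keys(const uint8_t *k2,       /* 11 round keys */
                                                const uint8_t *k1,       /* 11 round keys */
                                                const uint8_t *initial_tweak,
                                                __m128i staged_keys[11]) /* aligned scratch */
    {
        __m128i tw = _mm_loadu_si128((const __m128i *)initial_tweak);

        tw = _mm_xor_si128(tw, _mm_loadu_si128((const __m128i *)k2));    /* ARK */
        staged_keys[0] = _mm_loadu_si128((const __m128i *)k1);
        for (int r = 1; r <= 9; r++) {                                   /* rounds 1..9 */
            tw = _mm_aesenc_si128(tw, _mm_loadu_si128((const __m128i *)(k2 + 16 * r)));
            staged_keys[r] = _mm_loadu_si128((const __m128i *)(k1 + 16 * r));
        }
        tw = _mm_aesenclast_si128(tw, _mm_loadu_si128((const __m128i *)(k2 + 16 * 10)));
        staged_keys[10] = _mm_loadu_si128((const __m128i *)(k1 + 16 * 10));
        return tw;                               /* the assembly stores this at [TW] */
    }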
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
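The shl/adc/cmovc/xor sequence repeated above computes the next tweak, i.e. it multiplies the 128-bit tweak by x in GF(2^128) with the reduction constant 0x87 (GHASH_POLY). A scalar C equivalent (an illustrative sketch, not part of the patch), with the tweak held as two little-endian 64-bit halves exactly as the assembly keeps it in twtempl/twtemph:

    #include <stdint.h>

    static void xts_next_tweak(uint64_t tw[2])   /* tw[0] = low half, tw[1] = high half */
    {
        uint64_t carry_low  = tw[0] >> 63;       /* bit carried into the high half */
        uint64_t carry_high = tw[1] >> 63;       /* bit shifted out of the top     */

        tw[0] <<= 1;                             /* shl   twtempl, 1               */
        tw[1] = (tw[1] << 1) | carry_low;        /* adc   twtemph, twtemph         */
        if (carry_high)                          /* cmovc ... / xor twtempl, ...   */
            tw[0] ^= 0x87;
    }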
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
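Setting aside the interleaved tweak generation, every block in encrypt_initial (and in encrypt_by_eight below) follows the standard XTS relation C = E_K1(P xor T) xor T. A single-block sketch with AES-NI intrinsics (illustrative, not part of the patch), reusing the staged round keys:

    #include <immintrin.h>

    static __m128i xts_encrypt_block(__m128i plaintext, __m128i tweak,
                                     const __m128i round_keys[11])
    {
        __m128i st = _mm_xor_si128(plaintext, tweak);      /* xor Tweak value */
        st = _mm_xor_si128(st, round_keys[0]);             /* ARK             */
        for (int r = 1; r <= 9; r++)
            st = _mm_aesenc_si128(st, round_keys[r]);      /* rounds 1..9     */
        st = _mm_aesenclast_si128(st, round_keys[10]);     /* round 10        */
        return _mm_xor_si128(st, tweak);                   /* xor Tweak value */
    }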
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_128_enc_expanded_key_avx:function
+XTS_AES_128_enc_expanded_key_avx:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+	vpxor	xmm4, xmm4			; scratch register; no key expansion is needed here (keys arrive pre-expanded)
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+ sub target_ptr_val, 128 ; adjust target_ptr_val because last 4 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
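The branch ladder above dispatches on (N/16) mod 8 so that, after the 0..7 initial blocks, the remaining whole blocks are a multiple of 8; the final 8 blocks are handled outside the main loop (so no tweaks beyond the buffer are generated), and a non-zero N mod 16 falls through to cipher stealing. The bookkeeping as a C sketch (illustrative, not part of the patch; valid for the N >= 128 path):

    #include <stdint.h>

    struct xts_split {
        uint64_t initial_blocks;    /* 0..7, _initial_num_blocks_is_N          */
        uint64_t main_loop_iters;   /* 8-block iterations of _main_loop        */
        uint64_t tail_bytes;        /* 0..15, handled by _steal_cipher         */
    };

    static struct xts_split xts_split_length(uint64_t N)    /* assumes N >= 128 */
    {
        struct xts_split s;
        uint64_t full_blocks = N >> 4;                      /* whole 16-byte blocks */

        s.initial_blocks  = full_blocks & 7;                /* and tmp1, (7 << 4)   */
        s.main_loop_iters = (full_blocks - s.initial_blocks - 8) / 8;
        s.tail_bytes      = N & 15;                         /* and N_val, 15        */
        return s;
    }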
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+ vaesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
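The _steal_cipher path above implements standard XTS ciphertext stealing: the first N_val bytes of the last full block's ciphertext become the short final block, and the tail plaintext padded with the remaining "stolen" ciphertext bytes is encrypted under one more tweak to produce the new next-to-last block. A C sketch (illustrative, not part of the patch; the encrypt_block callback is a hypothetical stand-in for the AES rounds):

    #include <stdint.h>
    #include <string.h>

    typedef void (*encrypt_block_fn)(uint8_t block[16]);    /* hypothetical stand-in */

    static void xts_steal_cipher(uint8_t new_last_full_ct[16],
                                 uint8_t final_short_ct[/* tail_len */],
                                 const uint8_t last_full_ct[16],
                                 const uint8_t *tail_pt, unsigned tail_len,
                                 const uint8_t next_tweak[16],
                                 encrypt_block_fn encrypt_block)
    {
        uint8_t block[16];

        memcpy(final_short_ct, last_full_ct, tail_len);     /* C_m = head of old C_{m-1} */
        memcpy(block, tail_pt, tail_len);                   /* P_m ...                   */
        memcpy(block + tail_len, last_full_ct + tail_len,   /* ... padded with stolen    */
               16 - tail_len);                              /* ciphertext bytes          */
        for (int i = 0; i < 16; i++)
            block[i] ^= next_tweak[i];                      /* xor Tweak value           */
        encrypt_block(block);
        for (int i = 0; i < 16; i++)
            block[i] ^= next_tweak[i];                      /* xor Tweak value           */
        memcpy(new_last_full_ct, block, 16);                /* new C_{m-1}               */
    }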
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a	; shl 14 (16-2) / shr2
+;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b	; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm
new file mode 100644
index 000000000..07cf9f679
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_expanded_key_sse.asm
@@ -0,0 +1,1504 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; expanded keys are not assumed to be aligned
+; plaintext and ciphertext are not assumed to be aligned
+; the data key schedule (k1) is copied to the stack, aligned to 16 Bytes
+; the tweak key schedule (k2) is used only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 11 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*11 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*11 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
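This file provides the same routine as the AVX version above using legacy (non-VEX) SSE and AES-NI encodings; which variant to call is a run-time decision. A minimal selection sketch in C (illustrative, not part of the patch; it assumes the GCC/Clang <cpuid.h> helpers, and a complete check would also read XCR0 with xgetbv before preferring the AVX variant):

    #include <stdint.h>
    #include <cpuid.h>

    void XTS_AES_128_enc_expanded_key_avx(uint8_t *k2, uint8_t *k1, uint8_t *tw,
                                          uint64_t n, const uint8_t *pt, uint8_t *ct);
    void XTS_AES_128_enc_expanded_key_sse(uint8_t *k2, uint8_t *k1, uint8_t *tw,
                                          uint64_t n, const uint8_t *pt, uint8_t *ct);

    typedef void (*xts_enc_fn)(uint8_t *, uint8_t *, uint8_t *, uint64_t,
                               const uint8_t *, uint8_t *);

    static xts_enc_fn select_xts_enc(void)
    {
        unsigned a, b, c, d;

        if (!__get_cpuid(1, &a, &b, &c, &d) || !(c & bit_AES))
            return 0;                            /* both variants need AES-NI         */
        if ((c & bit_AVX) && (c & bit_OSXSAVE))  /* OS XCR0 check omitted for brevity */
            return XTS_AES_128_enc_expanded_key_avx;
        return XTS_AES_128_enc_expanded_key_sse;
    }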
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
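+; The tweak updates are stitched into the AES rounds: between each group of
+; aesenc instructions a few scalar ops (shl/adc/cmovc/xor on twtempl:twtemph)
+; advance the computation of the next eight tweak values, so the GF(2^128)
+; multiplications overlap the aesenc latency instead of running after round 10.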
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_128_enc_expanded_key_sse:function
+XTS_AES_128_enc_expanded_key_sse:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
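+	; The sector length is not a multiple of 16, so the tail is handled with
+	; ciphertext stealing. The ciphertext of the last full block (xmm8) is
+	; split: its first N_val bytes become the final partial output block,
+	; and its remaining bytes are appended to the last partial plaintext
+	; block, which is encrypted with one more tweak and stored at _done as
+	; the last full ciphertext block.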
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+	pxor	xmm8, xmm3	; state register is xmm8; instead of moving xmm3 to xmm8, the pxor destination register is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+ aesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a	; shl 14 (16-2) / shr2
+;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b	; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm
new file mode 100644
index 000000000..bcdd3a752
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_128_enc_sse.asm
@@ -0,0 +1,1529 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 128-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, aligned to 16 Bytes
+; the first key is required only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define	keys	rsp + 16*8	; store 11 expanded round keys (AES-128)
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*19 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*19 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*29 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*11 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_128_enc_sse(
+; UINT8 *k2, // key used for tweaking, 16*1 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*1 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
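+;
+; Illustrative call from C (buffer names are placeholders, not part of this
+; file); for a 512-byte sector:
+;
+;	uint8_t key2[16], key1[16], tweak[16], pt[512], ct[512];
+;	XTS_AES_128_enc_sse(key2, key1, tweak, 512, pt, ct);
+;
+; N must be at least 16 but need not be a multiple of 16; a trailing partial
+; block is handled by ciphertext stealing (see _steal_cipher below).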
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
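+
+; Each tweak update multiplies the 128-bit tweak held in twtemph:twtempl by
+; x in GF(2^128): the pair is shifted left one bit (shl/adc) and, if a bit
+; carries out of bit 127, the low half is xor-ed with GHASH_POLY (0x87),
+; i.e. reduction modulo x^128 + x^7 + x^2 + x + 1. A scalar sketch
+; (illustrative only, not part of the original source):
+;
+;	carry   = twtemph >> 63;
+;	twtemph = (twtemph << 1) | (twtempl >> 63);
+;	twtempl = (twtempl << 1) ^ (carry ? 0x87 : 0);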
+
+
+; produce the key for the next round
+; raw_key is the output of the aeskeygenassist instruction
+; before this key_expansion_128 macro, round_key holds the current round key
+; after this key_expansion_128 macro, round_key holds the next round key
+%macro key_expansion_128 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
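+; Net effect: with the low dword of xtmp cleared by the caller (pxor xmm4,
+; xmm4), the two shufps/pxor pairs form the running xor of the current round
+; key's words, and the final pxor with the broadcast aeskeygenassist result t
+; yields the next round key:
+;	w4 = w0 ^ t,  w5 = w0 ^ w1 ^ t,  w6 = w0 ^ w1 ^ w2 ^ t,  w7 = w0 ^ w1 ^ w2 ^ w3 ^ t
+; where t = SubWord(RotWord(w3)) ^ Rcon.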
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+
+ movdqu %%xkey2, [%%ptr_key2]
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 1 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 1 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 2 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 2 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 3 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 3 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*3], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 4 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 4 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*4], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 5 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 5 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*5], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 6 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 6 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*6], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x40 ; Generating round key 7 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x40 ; Generating round key 7 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*7], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x80 ; Generating round key 8 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x80 ; Generating round key 8 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*8], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1b ; Generating round key 9 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1b ; Generating round key 9 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*9], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x36 ; Generating round key 10 for key2
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x36 ; Generating round key 10 for key1
+ key_expansion_128 %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ movdqa [%%ptr_expanded_keys + 16*10], %%xkey1
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
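+; Both key schedules advance in lockstep above: each aeskeygenassist /
+; key_expansion_128 pair produces the next round key for key2 (consumed
+; immediately to encrypt the initial tweak) and for key1 (stored to the
+; expanded-key area for the data encryption below); only key1's schedule
+; is kept.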
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+
+
+
+
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_128_enc_sse:function
+XTS_AES_128_enc_sse:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+ aesenclast xmm8, [keys + 16*10] ; round 10
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
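The _steal_cipher path and the pshufb_shf_table above implement standard XTS ciphertext stealing: the first N_val bytes of the last full ciphertext block become the short final output, and the bytes they displace pad the short plaintext into one more full block (the table supplies the byte-shuffle masks that do this in registers). A minimal C sketch of the same logic for this encrypt-direction file, assuming a hypothetical aes128_xts_block() helper that stands in for the tweak-XOR, the ten AES-128 rounds and the final tweak-XOR:

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical stand-in for the assembly's tweak-XOR + AES rounds +
     * tweak-XOR sequence on one block. */
    void aes128_xts_block(uint8_t out[16], const uint8_t in[16],
                          const uint8_t tweak[16]);

    /* last_ct:  ciphertext of the last full block (xmm8 above)
     * tail:     the trailing N_val plaintext bytes, tail_len = N_val */
    static void xts_enc_steal_tail(uint8_t *out_last, uint8_t *out_tail,
                                   const uint8_t last_ct[16],
                                   const uint8_t *tail, size_t tail_len,
                                   const uint8_t next_tweak[16])
    {
        uint8_t buf[16];

        memcpy(out_tail, last_ct, tail_len);          /* C_m = head of C_{m-1}   */
        memcpy(buf, tail, tail_len);                  /* short final plaintext   */
        memcpy(buf + tail_len, last_ct + tail_len,    /* steal tail of C_{m-1}   */
               16 - tail_len);
        aes128_xts_block(out_last, buf, next_tweak);  /* re-encrypt as C_{m-1}   */
    }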
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm
new file mode 100644
index 000000000..767242d69
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_avx.asm
@@ -0,0 +1,1961 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, aligned to 16 Bytes
+; the first key is required only once, so it does not need to be stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
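For reference, the frame sizes above work out to:

    elf64:  VARIABLE_OFFSET = 16*8 + 16*15 + 8*1          = 376 = 8*47
    win64:  VARIABLE_OFFSET = 16*8 + 16*15 + 16*10 + 8*3  = 552 = 8*69

Both are odd multiples of 8: the call that enters the function leaves rsp at 8 mod 16, so subtracting an odd multiple of 8 restores 16-byte alignment, which the aligned vmovdqa accesses to [TW], [keys] and [_xmm] rely on.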
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_avx(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
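A usage sketch based on the prototype above; the 512-byte sector size and the wrapper name are illustrative, not part of this file:

    #include <stdint.h>

    void XTS_AES_256_dec_avx(uint8_t *k2, uint8_t *k1, uint8_t *TW_initial,
                             uint64_t N, const uint8_t *ct, uint8_t *pt);

    /* Decrypt one 512-byte sector: key2/key1 are the two raw 32-byte
     * AES-256 keys and iv is the 16-byte initial tweak (e.g. the
     * little-endian sector number). */
    static void decrypt_sector(uint8_t key2[32], uint8_t key1[32],
                               uint8_t iv[16], const uint8_t ct[512],
                               uint8_t pt[512])
    {
        XTS_AES_256_dec_avx(key2, key1, iv, 512, ct, pt);
    }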
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of the vaeskeygenassist instruction
+; round_key value before this key_expansion_256 macro is the current round key
+; round_key value after this key_expansion_256 macro is the next round key
+; the 2 macros below are used for key generation in a flip-flopped fashion
+; (a C-intrinsics sketch of the same flip/flop pairing follows the two macros)
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 10101010b
+ shufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
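A C-intrinsics sketch of one flip/flop pair, referenced above. It uses the common pslldq formulation rather than the shufps-with-zeroed-temp trick in the macros; both compute the same AES-256 schedule words. The helper names are illustrative, and the example derives round keys 2 and 3 (the rcon 0x1 step) from the two raw key halves:

    #include <immintrin.h>   /* AES-NI intrinsics; build with -maes */

    /* key ^= key<<32 ^ key<<64 ^ key<<96, then xor in the broadcast
     * aeskeygenassist word. */
    static inline __m128i expand_step(__m128i key, __m128i assist)
    {
        __m128i t = _mm_slli_si128(key, 4);
        key = _mm_xor_si128(key, t);
        t = _mm_slli_si128(t, 4);
        key = _mm_xor_si128(key, t);
        t = _mm_slli_si128(t, 4);
        key = _mm_xor_si128(key, t);
        return _mm_xor_si128(key, assist);
    }

    /* rk[0], rk[1] are the raw 32-byte key halves on entry. */
    static void expand_pair_rcon1(__m128i rk[4])
    {
        __m128i a;

        /* "flip": even round key, rcon-driven word broadcast from dword 3 */
        a = _mm_aeskeygenassist_si128(rk[1], 0x1);
        rk[2] = expand_step(rk[0], _mm_shuffle_epi32(a, 0xFF));

        /* "flop": odd round key, SubWord-only word broadcast from dword 2 */
        a = _mm_aeskeygenassist_si128(rk[2], 0x1);
        rk[3] = expand_step(rk[1], _mm_shuffle_epi32(a, 0xAA));
    }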
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 11
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%xtmp2 %8
+%define %%ptr_key2 %9
+%define %%ptr_key1 %10
+%define %%ptr_expanded_keys %11
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xtmp2
+
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xtmp2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xtmp2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xtmp2
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ vaesimc %%xtmp2, %%xkey1_2
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xtmp2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
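The repeated shl/adc/cmovc/xor sequences in this macro (and throughout the file) advance the tweak by multiplying the 128-bit value by x in GF(2^128), reducing with the byte 0x87 (GHASH_POLY). A scalar C sketch of one such step:

    #include <stdint.h>

    /* lo/hi correspond to twtempl/twtemph, the low and high 64-bit halves
     * of the current tweak. */
    static void next_tweak(uint64_t *lo, uint64_t *hi)
    {
        uint64_t carry = *hi >> 63;        /* bit shifted out of the top half */
        *hi = (*hi << 1) | (*lo >> 63);    /* shl twtempl,1 ; adc twtemph,... */
        *lo <<= 1;
        if (carry)                         /* cmovc + xor twtempl, 0x87       */
            *lo ^= 0x87;
    }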
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
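The scalar shape of one _main_loop iteration built from this macro is sketched below; xor16(), aes256_decrypt_block() and next_tweak_x8() are hypothetical helpers, and the assembly additionally interleaves the next-tweak arithmetic with the vaesdec rounds instead of running it as a separate pass:

    #include <stdint.h>
    #include <stddef.h>

    void xor16(uint8_t *dst, const uint8_t *a, const uint8_t *b);
    void aes256_decrypt_block(uint8_t *out, const uint8_t *in,
                              const uint8_t round_keys[15 * 16]);
    void next_tweak_x8(uint8_t tweak[16]);   /* multiply one tweak by x^8 */

    static void xts_dec_eight(uint8_t *pt, const uint8_t *ct,
                              uint8_t tweak[8][16],
                              const uint8_t round_keys[15 * 16])
    {
        for (int j = 0; j < 8; j++) {
            uint8_t buf[16];
            xor16(buf, ct + 16 * j, tweak[j]);            /* pre-whitening   */
            aes256_decrypt_block(buf, buf, round_keys);   /* 14 AES rounds   */
            xor16(pt + 16 * j, buf, tweak[j]);            /* post-whitening  */
        }
        for (int j = 0; j < 8; j++)                       /* next 8 tweaks   */
            next_tweak_x8(tweak[j]);
    }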
+
+
+section .text
+
+global XTS_AES_256_dec_avx:function
+XTS_AES_256_dec_avx:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7]
+ vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+	vpblendvb	xmm3, xmm3, xmm2, xmm0	; xmm0 holds the blend mask (explicit fourth operand in the AVX form)
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+	vaesdec	xmm8, [keys + 16*10]	; round 10
+	vaesdec	xmm8, [keys + 16*11]	; round 11
+	vaesdec	xmm8, [keys + 16*12]	; round 12
+	vaesdec	xmm8, [keys + 16*13]	; round 13
+	vaesdeclast	xmm8, [keys + 16*14]	; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm
new file mode 100644
index 000000000..d5a75d0ad
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_avx.asm
@@ -0,0 +1,1895 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, aligned to 16 bytes
+; the first key is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
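+
+; Resulting stack frame (offsets from rsp), as implied by the defines above:
+;   [rsp + 0     , rsp + 16*8 )   TW   - 8 tweak values
+;   [rsp + 16*8  , rsp + 16*23)   keys - 15 expanded round keys
+;   win64 only: [rsp + 16*23, rsp + 16*33)   _xmm - saved xmm6:xmm15
+;   _gpr (saved rbx, plus rdi/rsi on win64) follows at 16*23 (elf64) or 16*33 (win64)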
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
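+; Illustrative only: a minimal C call of the prototype above. The wrapper name
+; and key-schedule buffers are hypothetical; the caller is assumed to have
+; prepared the two 16*15-byte expanded key buffers and the 16-byte initial
+; tweak (typically derived from the sector number):
+;
+;   #include <stdint.h>
+;   extern void XTS_AES_256_dec_expanded_key_avx(uint8_t *k2, uint8_t *k1,
+;                                                uint8_t *tw_initial, uint64_t n,
+;                                                const uint8_t *ct, uint8_t *pt);
+;
+;   void decrypt_sector(uint8_t *k2_exp, uint8_t *k1_exp, uint8_t tweak[16],
+;                       const uint8_t *ct, uint8_t *pt, uint64_t sector_len)
+;   {
+;           XTS_AES_256_dec_expanded_key_avx(k2_exp, k1_exp, tweak, sector_len, ct, pt);
+;   }
+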
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
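+
+; Note: these register names (and many comments below, e.g. "plaintext",
+; "store ciphertext", "encrypt") are carried over from the encrypt variant.
+; In this decrypt routine ptr_plaintext holds the ciphertext input pointer
+; (5th argument, ct) and ptr_ciphertext the plaintext output pointer
+; (6th argument, pt).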
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*14]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*13]
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*12]
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*11]
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*11]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*12]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*13]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*14]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*0]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
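+
+; For reference, a C sketch (AES-NI intrinsics) of what encrypt_T computes:
+; the initial tweak is encrypted with the 15 round keys of key 2 while the
+; 15 round keys of key 1 are copied to the aligned stack area in the same
+; pass. Buffer names are illustrative:
+;
+;   #include <immintrin.h>
+;   __m128i t = _mm_loadu_si128((const __m128i *)tw_initial);
+;   t = _mm_xor_si128(t, _mm_loadu_si128((const __m128i *)(k2 + 16*0)));   /* ARK */
+;   for (int r = 1; r <= 13; r++)
+;           t = _mm_aesenc_si128(t, _mm_loadu_si128((const __m128i *)(k2 + 16*r)));
+;   t = _mm_aesenclast_si128(t, _mm_loadu_si128((const __m128i *)(k2 + 16*14)));
+;   /* t is the encrypted tweak that the macro stores at [TW] */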
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
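+
+; The shl/adc/cmovc/xor sequences above advance the tweak by a multiplication
+; by x in GF(2^128), reduced with the XTS polynomial byte 0x87 (GHASH_POLY).
+; A C sketch of the same update on the two 64-bit halves, mirroring how
+; twtempl/twtemph are used here (names illustrative):
+;
+;   #include <stdint.h>
+;   static void xts_mult_x(uint64_t tw[2])   /* tw[0] = low half, tw[1] = high half */
+;   {
+;           uint64_t carry_lo = tw[0] >> 63; /* bit carried from low into high half */
+;           uint64_t carry_hi = tw[1] >> 63; /* bit shifted out of the high half    */
+;           tw[0] <<= 1;
+;           tw[1] = (tw[1] << 1) | carry_lo;
+;           if (carry_hi)
+;                   tw[0] ^= 0x87;           /* reduce: x^128 = x^7 + x^2 + x + 1   */
+;   }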
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesdec %%ST1, %%T0
+ vaesdec %%ST2, %%T0
+ vaesdec %%ST3, %%T0
+ vaesdec %%ST4, %%T0
+ vaesdec %%ST5, %%T0
+ vaesdec %%ST6, %%T0
+ vaesdec %%ST7, %%T0
+ vaesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesdeclast %%ST1, %%T0
+ vaesdeclast %%ST2, %%T0
+ vaesdeclast %%ST3, %%T0
+ vaesdeclast %%ST4, %%T0
+ vaesdeclast %%ST5, %%T0
+ vaesdeclast %%ST6, %%T0
+ vaesdeclast %%ST7, %%T0
+ vaesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_256_dec_expanded_key_avx:function
+XTS_AES_256_dec_expanded_key_avx:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ vmovdqa xmm1, [TW + 16*7]
+ vmovdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
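+	;
+	; Outline of the ciphertext-stealing step below, for a final partial
+	; block of n = N mod 16 bytes (C-style sketch, helper names illustrative).
+	; On entry xmm8 holds D = dec(C[m-1], T[m]): the last full ciphertext
+	; block decrypted with the tweak of the partial block, and [TW] holds
+	; T[m-1] (the branches that jump here swap the two final tweaks).
+	;
+	;   memcpy(pt_tail, D, n);                /* final n plaintext bytes   */
+	;   uint8_t blk[16];
+	;   memcpy(blk, ct_tail, n);              /* n stolen ciphertext bytes */
+	;   memcpy(blk + n, D + n, 16 - n);       /* padded with the tail of D */
+	;   xts_block_dec(pt_last_full, blk, T[m-1]);    /* tweak taken from [TW] */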
+
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesdec xmm8, [keys + 16*1] ; round 1
+ vaesdec xmm8, [keys + 16*2] ; round 2
+ vaesdec xmm8, [keys + 16*3] ; round 3
+ vaesdec xmm8, [keys + 16*4] ; round 4
+ vaesdec xmm8, [keys + 16*5] ; round 5
+ vaesdec xmm8, [keys + 16*6] ; round 6
+ vaesdec xmm8, [keys + 16*7] ; round 7
+ vaesdec xmm8, [keys + 16*8] ; round 8
+ vaesdec xmm8, [keys + 16*9] ; round 9
+	vaesdec         xmm8, [keys + 16*10]                  ; round 10
+	vaesdec         xmm8, [keys + 16*11]                  ; round 11
+	vaesdec         xmm8, [keys + 16*12]                  ; round 12
+	vaesdec         xmm8, [keys + 16*13]                  ; round 13
+	vaesdeclast     xmm8, [keys + 16*14]                  ; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm15
+ vmovdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm14
+ vmovdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm13
+ vmovdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm12
+ vmovdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm11
+ vmovdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm10
+ vmovdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ vmovdqa [TW + 16*0] , xmm9
+ vmovdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm
new file mode 100644
index 000000000..6ee4e3378
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_expanded_key_sse.asm
@@ -0,0 +1,1897 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, aligned to 16 bytes
+; the first key is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*14]
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*13]
+ movdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*12]
+ movdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*11]
+ movdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*11]
+ aesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*12]
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*13]
+ aesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*14]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*0]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
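+
+; The xor/shl/adc/cmovc/xor sequence above (and repeated in the macros below)
+; is the XTS tweak update: multiplication by x in GF(2^128) with the
+; reduction polynomial x^128 + x^7 + x^2 + x + 1, whose low byte is
+; GHASH_POLY (0x87). A rough C equivalent, treating the tweak as two
+; little-endian 64-bit halves (illustrative sketch only):
+;
+;	UINT64 carry = hi >> 63;
+;	hi = (hi << 1) | (lo >> 63);
+;	lo = (lo << 1) ^ (carry ? 0x87 : 0);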
+
+
+; decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Decrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
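+
+; Note: the tweak-update steps are interleaved ("stitched") between the AES
+; rounds above so that the scalar GF(2^128) arithmetic can overlap with the
+; latency of aesdec; when %%last_eight is 1 the updates are skipped because
+; no further tweaks are needed.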
+
+
+section .text
+
+global XTS_AES_256_dec_expanded_key_sse:function
+XTS_AES_256_dec_expanded_key_sse:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
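+	; tmp1 = (number of 16-byte blocks) mod 8, scaled by 16; it selects how
+	; many single blocks (0-7) are handled before the 8-blocks-at-a-time
+	; main loop. Worked example (illustrative): N = 272 bytes = 17 blocks
+	; -> 1 initial block, one pass of _main_loop (8 blocks), and the final
+	; 8 blocks in _last_eight / _done_final.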
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
+
+
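+	; Ciphertext stealing for the trailing N mod 16 bytes: the last full
+	; ciphertext block was decrypted above with the *next* tweak (see the
+	; swap at _last_eight), its leading bytes become the final partial
+	; plaintext, and the recombined block (partial ciphertext plus the
+	; borrowed tail) is decrypted below with the tweak saved at [TW].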
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+	pxor	xmm8, xmm3	; keep the state in xmm8: rather than moving xmm3 to xmm8, the pxor destination is swapped
+
+
+	; decrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+	aesdec	xmm8, [keys + 16*10]	; round 10
+	aesdec	xmm8, [keys + 16*11]	; round 11
+	aesdec	xmm8, [keys + 16*12]	; round 12
+	aesdec	xmm8, [keys + 16*13]	; round 13
+	aesdeclast	xmm8, [keys + 16*14]	; round 14
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm15
+ movdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm14
+ movdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm13
+ movdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm12
+ movdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm11
+ movdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm10
+ movdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm9
+ movdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
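+
+; The table is indexed with the residual byte count: a 16-byte load from
+; [pshufb_shf_table + N_val] yields a pshufb mask that shifts a register left
+; by (16 - N_val) bytes, while a load from [pshufb_shf_table + 16 - N_val],
+; xored with mask1, yields the matching right shift (see _steal_cipher above).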
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm
new file mode 100644
index 000000000..6ea1ae8b8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_dec_sse.asm
@@ -0,0 +1,1962 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS decrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, aligned to 16 bytes
+; the first key is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_dec_sse(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" decryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *ct, // ciphertext sector input data
+; UINT8 *pt); // plaintext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
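+;
+; Unlike XTS_AES_256_dec_expanded_key_sse, this entry point takes the two raw
+; 32-byte AES-256 keys and expands them on the fly, interleaved with the
+; tweak encryption (see encrypt_T below). Illustrative call with hypothetical
+; buffers:
+;
+;	UINT8 k2[32], k1[32], tweak[16], ct[4096], pt[4096];
+;	XTS_AES_256_dec_sse(k2, k1, tweak, 4096, ct, pt);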
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of the aeskeygenassist instruction
+; round_key holds the current round key when a key_expansion_256 macro is
+; entered and the next round key when it returns
+; the two macros below are used for key generation in a flip-flopped fashion
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 10101010b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
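+
+; The flip/flop pair mirrors the AES-256 key schedule: aeskeygenassist leaves
+; RotWord(SubWord(w)) xor Rcon in dword 3 of its result and SubWord(w) in
+; dword 2. key_expansion_256_flip broadcasts dword 3 (pshufd 0xFF) for the
+; round-key half that takes the rotate/Rcon step (every 8th key word), while
+; key_expansion_256_flop broadcasts dword 2 (pshufd 0xAA) for the half that
+; applies SubWord only; the shufps/pxor chain folds in the running xor of the
+; previous round key's words.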
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 11
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%xtmp2 %8
+%define %%ptr_key2 %9
+%define %%ptr_key1 %10
+%define %%ptr_expanded_keys %11
+
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ movdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*13], %%xtmp2
+
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*12], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*11], %%xtmp2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*10], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*9], %%xtmp2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*8], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*7], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*6], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*5], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*4], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*3], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ aesimc %%xtmp2, %%xkey1
+ movdqa [%%ptr_expanded_keys+16*2], %%xtmp2
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ aesimc %%xtmp2, %%xkey1_2
+ movdqa [%%ptr_expanded_keys+16*1], %%xtmp2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; decrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
+; next 8 Tweak values are generated
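+;
+; Note (explanatory): when %%lt128 is 0 the next eight tweak values are computed
+; with the same shift/carry/xor pattern and written back to [TW], with the
+; integer work spread between the AES rounds so it can overlap the latency of
+; the aesdec instructions; when %%lt128 is non-zero fewer than 128 bytes remain
+; and no further tweaks are generated.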
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdec %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdec %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdec %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdec %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdec %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdec %%ST7, %%T0
+%endif
+
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesdeclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesdeclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesdeclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesdeclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesdeclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesdeclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Decrypt 8 blocks in parallel
+; generate next 8 tweak values
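+;
+; Note (explanatory): same stitching as in encrypt_initial, but for a full group
+; of eight blocks; %%last_eight marks the final group, for which no further
+; tweak values need to be generated.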
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesdec %%ST1, %%T0
+ aesdec %%ST2, %%T0
+ aesdec %%ST3, %%T0
+ aesdec %%ST4, %%T0
+ aesdec %%ST5, %%T0
+ aesdec %%ST6, %%T0
+ aesdec %%ST7, %%T0
+ aesdec %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesdeclast %%ST1, %%T0
+ aesdeclast %%ST2, %%T0
+ aesdeclast %%ST3, %%T0
+ aesdeclast %%ST4, %%T0
+ aesdeclast %%ST5, %%T0
+ aesdeclast %%ST6, %%T0
+ aesdeclast %%ST7, %%T0
+ aesdeclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_256_dec_sse:function
+XTS_AES_256_dec_sse:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, xmm7, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
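+	; Note (explanatory): tmp1 is (N_val mod 128) rounded down to a multiple of
+	; 16; the compares against (k << 4) below dispatch on how many whole 16-byte
+	; blocks (1..7) are handled first, so the remaining length is a multiple of
+	; 128 bytes and the main loop can always work on groups of eight blocks.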
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_final
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ movdqa xmm1, [TW + 16*7]
+ movdqa [TW + 16*0], xmm1 ; swap tweak values for cipher stealing for decrypt
+
+ mov [TW + 16*7], twtempl
+ mov [TW + 16*7+8], twtemph
+
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ jmp _steal_cipher
+
+
+_done_final:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+ jmp _done
+
+
+_steal_cipher:
+ ; start cipher stealing
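+	; Note (explanatory): XTS ciphertext stealing for decryption. The input is
+	; not a whole number of blocks, so the final partial block borrows bytes
+	; from the previous block's output. The last two tweak values are applied in
+	; swapped order (see the swap at _last_eight): the last full block was
+	; decrypted with the newly generated tweak, and the recombined block below
+	; is decrypted with the preceding one, read back from [TW].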
+
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+	pxor	xmm8, xmm3	; state register is xmm8; instead of moving xmm3 to xmm8, the destination of the pxor is simply swapped
+
+
+	; decrypt last block with ciphertext stealing
+ pxor xmm8, [keys] ; ARK
+ aesdec xmm8, [keys + 16*1] ; round 1
+ aesdec xmm8, [keys + 16*2] ; round 2
+ aesdec xmm8, [keys + 16*3] ; round 3
+ aesdec xmm8, [keys + 16*4] ; round 4
+ aesdec xmm8, [keys + 16*5] ; round 5
+ aesdec xmm8, [keys + 16*6] ; round 6
+ aesdec xmm8, [keys + 16*7] ; round 7
+ aesdec xmm8, [keys + 16*8] ; round 8
+ aesdec xmm8, [keys + 16*9] ; round 9
+	aesdec	xmm8, [keys + 16*10]	; round 10
+	aesdec	xmm8, [keys + 16*11]	; round 11
+	aesdec	xmm8, [keys + 16*12]	; round 12
+	aesdec	xmm8, [keys + 16*13]	; round 13
+	aesdeclast	xmm8, [keys + 16*14]	; round 14
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+
+
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+
+ sub ptr_plaintext, 16*1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_7
+
+_steal_cipher_7:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm15
+ movdqa xmm15, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _steal_cipher
+
+_done_7:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+
+ sub ptr_plaintext, 16*2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_6
+
+_steal_cipher_6:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm14
+ movdqa xmm14, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _steal_cipher
+
+_done_6:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+ jmp _done
+
+
+
+
+
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+
+ sub ptr_plaintext, 16*3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_5
+
+_steal_cipher_5:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm13
+ movdqa xmm13, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _steal_cipher
+
+_done_5:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+ jmp _done
+
+
+
+
+
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+
+ sub ptr_plaintext, 16*4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_4
+
+_steal_cipher_4:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm12
+ movdqa xmm12, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _steal_cipher
+
+_done_4:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+ jmp _done
+
+
+
+
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+
+ sub ptr_plaintext, 16*5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_3
+
+_steal_cipher_3:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm11
+ movdqa xmm11, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _steal_cipher
+
+_done_3:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+ jmp _done
+
+
+
+
+
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+
+ sub ptr_plaintext, 16*6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_2
+
+_steal_cipher_2:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm10
+ movdqa xmm10, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _steal_cipher
+
+_done_2:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+ jmp _done
+
+
+
+
+
+
+
+
+
+
+
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done_1
+
+_steal_cipher_1:
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+
+ movdqa [TW + 16*0] , xmm9
+ movdqa xmm9, [TW+16*1]
+
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _steal_cipher
+
+_done_1:
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+ jmp _done
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
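+
+; Note (explanatory): loading 16 bytes at [pshufb_shf_table + N_val] gives a
+; pshufb mask that shifts a register left by (16 - N_val) bytes (bytes with the
+; top bit set are zeroed); loading at [pshufb_shf_table + 16 - N_val] and XORing
+; with mask1 (all 0x80) gives the matching right-shift mask, whose 0x80 bytes
+; also act as the implicit xmm0 selector for pblendvb during ciphertext stealing.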
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm
new file mode 100644
index 000000000..e0c1d4934
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_avx.asm
@@ -0,0 +1,1707 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; second key is stored in the stack as aligned to 16 Bytes
+; first key is required only once, no need for storage of this key
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
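+; Note (explanatory): 0x87 encodes the reduction term of the XTS tweak update;
+; multiplying the tweak by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1 folds
+; an overflowing bit back in as an XOR of 0x87 into the low byte.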
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_avx(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
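+;
+; Illustrative call (assumed C caller, not part of this file), with 32-byte keys
+; k1 and k2, a 16-byte initial tweak iv and a 512-byte sector:
+;     XTS_AES_256_enc_avx(k2, k1, iv, 512, sector_in, sector_out);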
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; produce the key for the next round
+; raw_key is the output of vaeskeygenassist instruction
+; round_key value before this key_expansion_256 macro is the current round key
+; round_key value after this key_expansion_256 macro is the next round key
+; 2 macros are used for key generation in a flip-flopped fashion
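+;
+; Note (explanatory): together these two macros implement the AES-256 key
+; schedule. "flip" broadcasts the aeskeygenassist lane produced with
+; RotWord+SubWord+Rcon (shuffle control 11111111b) and yields the next
+; even-numbered round key; "flop" broadcasts the SubWord-only lane (shuffle
+; control 10101010b) and yields the following odd-numbered round key.
+; Alternating them walks the full 15-round-key schedule.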
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ vpshufd %%xraw_key, %%xraw_key, 10101010b
+ shufps %%xtmp, %%xround_key, 00010000b
+ vpxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ vpxor %%xround_key, %%xtmp
+ vpxor %%xround_key, %%xraw_key
+%endmacro
+
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
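+; key2's schedule is consumed on the fly (each new round key immediately feeds a
+; vaesenc of the tweak), while key1's schedule is written out to [keys] on the
+; stack for the bulk encryption that follows; key2 is only needed here, so its
+; expansion is never stored.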
+
+%macro encrypt_T 10
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%ptr_key2 %8
+%define %%ptr_key1 %9
+%define %%ptr_expanded_keys %10
+
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ vmovdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1_2
+
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1_2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1_2
+
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1
+
+ vaeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ vaeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ vaesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1_2
+
+
+ vaeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ vaeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
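+	; The next 8 tweak values are computed here, stitched into the AES
+	; rounds below to hide the latency of the scalar work.  Each tweak is
+	; the previous one multiplied by x in GF(2^128): twtempl/twtemph hold
+	; the low/high 64-bit halves, shl+adc shifts the 128-bit value left by
+	; one bit, cmovc selects the reduction constant GHASH_POLY (0x87) when
+	; the shift carries out, and the xor folds it into the low half.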
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_256_enc_avx:function
+XTS_AES_256_enc_avx:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub	target_ptr_val, 128		; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
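+	; tmp1 = (number of 16-byte blocks mod 8) * 16; dispatch on it so the
+	; length left for _main_loop is a whole number of 8-block (128-byte) groups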
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
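+	; XTS ciphertext stealing: xmm8 holds the ciphertext of the last full
+	; block.  Its first N_val bytes become the final partial ciphertext,
+	; the remaining plaintext bytes (N_val mod 16) are completed with the
+	; stolen tail of that ciphertext block, and the resulting block is
+	; encrypted under the next tweak and written back as the last full
+	; ciphertext block.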
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+	vaesenc	xmm8, [keys + 16*10]	; round 10
+	vaesenc	xmm8, [keys + 16*11]	; round 11
+	vaesenc	xmm8, [keys + 16*12]	; round 12
+	vaesenc	xmm8, [keys + 16*13]	; round 13
+	vaesenclast	xmm8, [keys + 16*14]	; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
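+; a mask byte with its top bit set (the 0x8x values above) makes vpshufb zero
+; the corresponding destination byte, which is how these constants implement
+; byte-granular shifts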
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm
new file mode 100644
index 000000000..e4ace9467
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_avx.asm
@@ -0,0 +1,1652 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; expanded keys are not assumed to be aligned
+; plaintext and ciphertext are not assumed to be aligned
+; the second key (k1) is copied to the stack, 16-byte aligned
+; the first key (k2) is needed only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_expanded_key_avx(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
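+
+; A minimal usage sketch in C (illustration only): it assumes both keys have
+; already been expanded into 15 round keys of 16 bytes each, e.g. with
+; ISA-L's aes_keyexp_256(); that helper and the buffer sizes below are
+; assumptions, not part of this file.
+;
+;	uint8_t key1[32], key2[32];              // raw 256-bit AES keys
+;	uint8_t ek1[16*15], ek2[16*15], dec[16*15];
+;	uint8_t tweak[16] = {0};                 // e.g. little-endian sector number
+;	uint8_t pt[4096], ct[4096];
+;
+;	aes_keyexp_256(key2, ek2, dec);          // assumed key-expansion helper
+;	aes_keyexp_256(key1, ek1, dec);
+;	XTS_AES_256_enc_expanded_key_avx(ek2, ek1, tweak, sizeof(pt), pt, ct);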
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
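+; twtempl holds the low and twtemph the high 64 bits of the tweak currently
+; being derived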
+
+
+; macro to encrypt the tweak value
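+; the initial tweak is encrypted with the key2 schedule (14 AES-256 rounds)
+; while the caller-supplied key1 round keys are copied to the aligned stack
+; area at [keys]; the encrypted tweak is saved at [TW]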
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ vmovdqu %%xkey2, [%%ptr_key2]
+ vpxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1]
+ vmovdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*1]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*1]
+ vmovdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*2]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*2]
+ vmovdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*3]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*3]
+ vmovdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*4]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*4]
+ vmovdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*5]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*5]
+ vmovdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*6]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*6]
+ vmovdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*7]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*7]
+ vmovdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*8]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*8]
+ vmovdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*9]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*9]
+ vmovdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*10]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*10]
+ vmovdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*11]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*11]
+ vmovdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*12]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*12]
+ vmovdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*13]
+ vaesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*13]
+ vmovdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+ vmovdqu %%xkey2, [%%ptr_key2 + 16*14]
+ vaesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ vmovdqu %%xkey1, [%%ptr_key1 + 16*14]
+ vmovdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ vmovdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
+
+
+; generate initial tweak values
+; load initial plaintext values
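+; each tweak is the previous tweak multiplied by x in GF(2^128) (the
+; shl/adc/cmovc/xor sequences below)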
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ vmovdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ vmovdqa %%TW2, [TW+16*1]
+ vmovdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ vmovdqa %%TW3, [TW+16*2]
+ vmovdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ vmovdqa %%TW4, [TW+16*3]
+ vmovdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ vmovdqa %%TW5, [TW+16*4]
+ vmovdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ vmovdqa %%TW6, [TW+16*5]
+ vmovdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ vmovdqa %%TW7, [TW+16*6]
+ vmovdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
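+; the %%lt128 argument flags that fewer than 128 bytes remain, in which case
+; the stitched generation of the next 8 tweak values is skipped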
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ vaesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ vaesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ vaesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ vaesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ vaesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ vaesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ vpxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ vpxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ vpxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ vpxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ vpxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ vpxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
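+; %%last_eight is set for the final group of 8 blocks, where the stitched
+; tweak computation is skipped because no further tweak values are needed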
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ ; ARK
+ vmovdqa %%T0, [keys]
+ vpxor %%ST1, %%T0
+ vpxor %%ST2, %%T0
+ vpxor %%ST3, %%T0
+ vpxor %%ST4, %%T0
+ vpxor %%ST5, %%T0
+ vpxor %%ST6, %%T0
+ vpxor %%ST7, %%T0
+ vpxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ vmovdqa %%T0, [keys + 16*1]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ vmovdqa %%T0, [keys + 16*2]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ vmovdqa %%T0, [keys + 16*3]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ vmovdqa %%T0, [keys + 16*4]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ vmovdqa %%T0, [keys + 16*5]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ vmovdqa %%T0, [keys + 16*6]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ vmovdqa %%T0, [keys + 16*7]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ vmovdqa %%T0, [keys + 16*8]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ vmovdqa %%T0, [keys + 16*9]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ vmovdqa %%T0, [keys + 16*10]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ vmovdqa %%T0, [keys + 16*11]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ vmovdqa %%T0, [keys + 16*12]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ vmovdqa %%T0, [keys + 16*13]
+ vaesenc %%ST1, %%T0
+ vaesenc %%ST2, %%T0
+ vaesenc %%ST3, %%T0
+ vaesenc %%ST4, %%T0
+ vaesenc %%ST5, %%T0
+ vaesenc %%ST6, %%T0
+ vaesenc %%ST7, %%T0
+ vaesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ vmovdqa %%T0, [keys + 16*14]
+ vaesenclast %%ST1, %%T0
+ vaesenclast %%ST2, %%T0
+ vaesenclast %%ST3, %%T0
+ vaesenclast %%ST4, %%T0
+ vaesenclast %%ST5, %%T0
+ vaesenclast %%ST6, %%T0
+ vaesenclast %%ST7, %%T0
+ vaesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ vpxor %%ST1, %%TW1
+ vpxor %%ST2, %%TW2
+ vpxor %%ST3, %%TW3
+ vpxor %%ST4, %%TW4
+ vpxor %%ST5, %%TW5
+ vpxor %%ST6, %%TW6
+ vpxor %%ST7, %%TW7
+ vpxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ vmovdqa %%TW1, [TW + 16*0]
+ vmovdqa %%TW2, [TW + 16*1]
+ vmovdqa %%TW3, [TW + 16*2]
+ vmovdqa %%TW4, [TW + 16*3]
+ vmovdqa %%TW5, [TW + 16*4]
+ vmovdqa %%TW6, [TW + 16*5]
+ vmovdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_256_enc_expanded_key_avx:function
+XTS_AES_256_enc_expanded_key_avx:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ vmovdqa [_xmm + 16*0], xmm6
+ vmovdqa [_xmm + 16*1], xmm7
+ vmovdqa [_xmm + 16*2], xmm8
+ vmovdqa [_xmm + 16*3], xmm9
+ vmovdqa [_xmm + 16*4], xmm10
+ vmovdqa [_xmm + 16*5], xmm11
+ vmovdqa [_xmm + 16*6], xmm12
+ vmovdqa [_xmm + 16*7], xmm13
+ vmovdqa [_xmm + 16*8], xmm14
+ vmovdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ vmovdqu xmm1, [T_val] ; read initial Tweak value
+ vpxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
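+	; target_ptr_val now marks where the final 8 full blocks begin in the
+	; ciphertext buffer; the main loop runs until ptr_ciphertext reaches it,
+	; and those last 8 blocks are encrypted separately without generating
+	; further tweak values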
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ vmovdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ vmovdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ vmovdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ vmovdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ vmovdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ vmovdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ vmovdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ vmovdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;vmovdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ vmovdqu xmm1, [ptr_plaintext+16*0]
+ vmovdqu xmm2, [ptr_plaintext+16*1]
+ vmovdqu xmm3, [ptr_plaintext+16*2]
+ vmovdqu xmm4, [ptr_plaintext+16*3]
+ vmovdqu xmm5, [ptr_plaintext+16*4]
+ vmovdqu xmm6, [ptr_plaintext+16*5]
+ vmovdqu xmm7, [ptr_plaintext+16*6]
+ vmovdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+ vmovdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
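+	; N_val mod 16 trailing plaintext bytes remain. The first N_val bytes of
+	; the last full ciphertext block (xmm8) become the partial tail of the
+	; output, while its remaining 16-N_val bytes are stolen and appended after
+	; the trailing plaintext bytes; the combined 16-byte block is encrypted
+	; under one more tweak and written back as the new last full block.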
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ vmovdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table]
+ vmovdqu xmm0, [twtempl+N_val]
+ vpshufb xmm8, xmm0
+
+
+ vmovdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ vmovdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [vpshufb_shf_table +16]
+ sub twtempl, N_val
+ vmovdqu xmm0, [twtempl]
+ vpxor xmm0, [mask1]
+ vpshufb xmm3, xmm0
+
+ vpblendvb xmm3, xmm3, xmm2, xmm0 ;xmm0 is implicit
+
+ ; xor Tweak value
+ vmovdqa xmm8, [TW]
+ vpxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of vpxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ vpxor xmm8, [keys] ; ARK
+ vaesenc xmm8, [keys + 16*1] ; round 1
+ vaesenc xmm8, [keys + 16*2] ; round 2
+ vaesenc xmm8, [keys + 16*3] ; round 3
+ vaesenc xmm8, [keys + 16*4] ; round 4
+ vaesenc xmm8, [keys + 16*5] ; round 5
+ vaesenc xmm8, [keys + 16*6] ; round 6
+ vaesenc xmm8, [keys + 16*7] ; round 7
+ vaesenc xmm8, [keys + 16*8] ; round 8
+ vaesenc xmm8, [keys + 16*9] ; round 9
+	vaesenc         xmm8, [keys + 16*10]            ; round 10
+	vaesenc         xmm8, [keys + 16*11]            ; round 11
+	vaesenc         xmm8, [keys + 16*12]            ; round 12
+	vaesenc         xmm8, [keys + 16*13]            ; round 13
+	vaesenclast     xmm8, [keys + 16*14]            ; round 14
+
+ ; xor Tweak value
+ vpxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ vmovdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ vmovdqa xmm6, [_xmm + 16*0]
+ vmovdqa xmm7, [_xmm + 16*1]
+ vmovdqa xmm8, [_xmm + 16*2]
+ vmovdqa xmm9, [_xmm + 16*3]
+ vmovdqa xmm10, [_xmm + 16*4]
+ vmovdqa xmm11, [_xmm + 16*5]
+ vmovdqa xmm12, [_xmm + 16*6]
+ vmovdqa xmm13, [_xmm + 16*7]
+ vmovdqa xmm14, [_xmm + 16*8]
+ vmovdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+ vmovdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ vmovdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+ vmovdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ vmovdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+ vmovdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ vmovdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+ vmovdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ vmovdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext+16*0], xmm1
+ vmovdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ vmovdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ vmovdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ vmovdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ vmovdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+vpshufb_shf_table:
+; use these values for shift constants for the vpshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm
new file mode 100644
index 000000000..47fe65281
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_expanded_key_sse.asm
@@ -0,0 +1,1651 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; expanded keys are not aligned
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, aligned to 16 bytes
+; the first key is required only once, so it is not stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
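+; 0x87 is the low-order byte of the XTS reduction polynomial
+; x^128 + x^7 + x^2 + x + 1; it is xor-ed into the low tweak word whenever a
+; left shift of the 128-bit tweak carries out of the top bit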
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_expanded_key_sse(
+; UINT8 *k2, // key used for tweaking, 16*15 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*15 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
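+;
+; A call from C might look like the sketch below (illustrative only; the
+; aes_keyexp_256() helper is assumed to be the library's key-expansion
+; routine, and all buffer names are placeholders):
+;
+;	uint8_t k1_enc[16*15], k1_dec[16*15];	// expanded data ("ECB") key
+;	uint8_t k2_enc[16*15], k2_dec[16*15];	// expanded tweak key
+;	aes_keyexp_256(raw_key1, k1_enc, k1_dec);
+;	aes_keyexp_256(raw_key2, k2_enc, k2_dec);
+;	XTS_AES_256_enc_expanded_key_sse(k2_enc, k1_enc, initial_tweak,
+;					 sector_len, plaintext, ciphertext);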
+
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+	%xdefine ptr_plaintext r10	; [rsp + VARIABLE_OFFSET + 8*5]
+	%xdefine ptr_ciphertext r11	; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
+
+
+; macro to encrypt the tweak value
+
+%macro encrypt_T 8
+%define %%xkey2 %1
+%define %%xstate_tweak %2
+%define %%xkey1 %3
+%define %%xraw_key %4
+%define %%xtmp %5
+%define %%ptr_key2 %6
+%define %%ptr_key1 %7
+%define %%ptr_expanded_keys %8
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*2]
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*2]
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*3]
+ aesenc %%xstate_tweak, %%xkey2 ; round 3 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*3]
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*4]
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*4]
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*5]
+ aesenc %%xstate_tweak, %%xkey2 ; round 5 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*5]
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*6]
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*6]
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*7]
+ aesenc %%xstate_tweak, %%xkey2 ; round 7 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*7]
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*8]
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*8]
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*9]
+ aesenc %%xstate_tweak, %%xkey2 ; round 9 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*9]
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*10]
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*10]
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1 ; store round keys in stack
+
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*11]
+ aesenc %%xstate_tweak, %%xkey2 ; round 11 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*11]
+ movdqa [%%ptr_expanded_keys+16*11], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*12]
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*12]
+ movdqa [%%ptr_expanded_keys+16*12], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*13]
+ aesenc %%xstate_tweak, %%xkey2 ; round 13 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*13]
+ movdqa [%%ptr_expanded_keys+16*13], %%xkey1 ; store round keys in stack
+
+ movdqu %%xkey2, [%%ptr_key2 + 16*14]
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1 + 16*14]
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1 ; store round keys in stack
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
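+
+; encrypt_T does two jobs in one pass: it encrypts the initial tweak value with
+; the key2 schedule (producing the first tweak, stored at [TW]) and, in the same
+; sequence, copies all 15 round keys of key1 into the 16-byte-aligned stack area
+; so the later rounds can use aligned movdqa loads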
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
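+
+; Each tweak above is derived from the previous one by multiplying it by x in
+; GF(2^128), modulo x^128 + x^7 + x^2 + x + 1. The shl/adc/cmovc/xor sequence is
+; equivalent to the following C sketch (tw_lo/tw_hi are the two 64-bit halves
+; held in twtempl/twtemph; the names are illustrative):
+;
+;	uint64_t carry = tw_hi >> 63;              // bit shifted out of the top
+;	tw_hi = (tw_hi << 1) | (tw_lo >> 63);      // shl twtempl,1 ; adc twtemph,twtemph
+;	tw_lo = (tw_lo << 1) ^ (carry ? 0x87 : 0); // cmovc + xor with GHASH_POLY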
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
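+
+; The %%lt128 flag distinguishes the two callers of encrypt_initial: in the main
+; path (%%lt128 == 0) the next set of tweak values is generated between the AES
+; rounds and reloaded at the end, while in the short (<128 byte) path the tweak
+; generation is skipped because no further full groups of blocks follow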
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_256_enc_expanded_key_sse:function
+XTS_AES_256_enc_expanded_key_sse:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm1, xmm2, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+	sub target_ptr_val, 128		; adjust target_ptr_val because last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
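+	; tmp1 = 16 * ((N_val / 16) mod 8): the number of whole blocks that do not
+	; fill a group of eight; these 0-7 blocks are encrypted first so that the
+	; remaining data is a multiple of 128 bytes for the 8-block main loop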
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+	aesenc  xmm8, [keys + 16*10]            ; round 10
+	aesenc  xmm8, [keys + 16*11]            ; round 11
+	aesenc  xmm8, [keys + 16*12]            ; round 12
+	aesenc  xmm8, [keys + 16*13]            ; round 13
+	aesenclast      xmm8, [keys + 16*14]    ; round 14
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm
new file mode 100644
index 000000000..244c33b36
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/XTS_AES_256_enc_sse.asm
@@ -0,0 +1,1707 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; XTS encrypt function with 256-bit AES
+; input keys are not aligned
+; keys are expanded in parallel with the tweak encryption
+; plaintext and ciphertext are not aligned
+; the second key is stored on the stack, aligned to 16 bytes
+; the first key is required only once and does not need to be stored
+
+%include "reg_sizes.asm"
+
+default rel
+%define TW rsp ; store 8 tweak values
+%define keys rsp + 16*8 ; store 15 expanded keys
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define _xmm rsp + 16*23 ; store xmm6:xmm15
+%endif
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define _gpr rsp + 16*23 ; store rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 8*1 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%else
+%define _gpr rsp + 16*33 ; store rdi, rsi, rbx
+%define VARIABLE_OFFSET 16*8 + 16*15 + 16*10 + 8*3 ; VARIABLE_OFFSET has to be an odd multiple of 8
+%endif
+
+%define GHASH_POLY 0x87
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void XTS_AES_256_enc_sse(
+; UINT8 *k2, // key used for tweaking, 16*2 bytes
+; UINT8 *k1, // key used for "ECB" encryption, 16*2 bytes
+; UINT8 *TW_initial, // initial tweak value, 16 bytes
+; UINT64 N, // sector size, in bytes
+; const UINT8 *pt, // plaintext sector input data
+; UINT8 *ct); // ciphertext sector output data
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
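+; N is the sector size in bytes; a sector shorter than one AES block (N < 16) produces no
+; output, and a sector whose length is not a multiple of 16 is completed with ciphertext
+; stealing, so for N >= 16 exactly N bytes of ciphertext are written to ct.
+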
+; arguments for input parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %xdefine ptr_key2 rdi
+ %xdefine ptr_key1 rsi
+ %xdefine T_val rdx
+ %xdefine N_val rcx
+ %xdefine ptr_plaintext r8
+ %xdefine ptr_ciphertext r9
+%else
+ %xdefine ptr_key2 rcx
+ %xdefine ptr_key1 rdx
+ %xdefine T_val r8
+ %xdefine N_val r9
+ %xdefine ptr_plaintext r10; [rsp + VARIABLE_OFFSET + 8*5]
+ %xdefine ptr_ciphertext r11; [rsp + VARIABLE_OFFSET + 8*6]
+%endif
+
+; arguments for temp parameters
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define tmp1 rdi
+ %define target_ptr_val rsi
+ %define ghash_poly_8b r10
+ %define ghash_poly_8b_temp r11
+%else
+ %define tmp1 rcx
+ %define target_ptr_val rdx
+ %define ghash_poly_8b rdi
+ %define ghash_poly_8b_temp rsi
+%endif
+
+%define twtempl rax ; global temp registers used for tweak computation
+%define twtemph rbx
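+
+; Note on the tweak update used throughout this file: GHASH_POLY (0x87) is the low byte of
+; the XTS reduction polynomial x^128 + x^7 + x^2 + x + 1. The recurring sequence
+;     shl   twtempl, 1
+;     adc   twtemph, twtemph
+;     cmovc ghash_poly_8b_temp, ghash_poly_8b
+;     xor   twtempl, ghash_poly_8b_temp
+; multiplies the 128-bit tweak (twtemph:twtempl) by alpha in GF(2^128): the low qword is
+; doubled, its carry is pulled into the doubled high qword by adc, and if the top bit of
+; the tweak carried out, 0x87 is folded back into the low qword.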
+
+
+; produce the key for the next round
+; raw_key is the output of aeskeygenassist instruction
+; round_key value before this key_expansion_256 macro is the current round key
+; round_key value after this key_expansion_256 macro is the next round key
+; 2 macros will be used for key generation in a flip-flopped fashion
+%macro key_expansion_256_flip 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 11111111b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
+
+%macro key_expansion_256_flop 3
+%define %%xraw_key %1
+%define %%xtmp %2
+%define %%xround_key %3
+ pshufd %%xraw_key, %%xraw_key, 10101010b
+ shufps %%xtmp, %%xround_key, 00010000b
+ pxor %%xround_key, %%xtmp
+ shufps %%xtmp, %%xround_key, 10001100b
+ pxor %%xround_key, %%xtmp
+ pxor %%xround_key, %%xraw_key
+%endmacro
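+
+; Note: key_expansion_256_flip broadcasts dword 3 of the aeskeygenassist result
+; (pshufd 0xFF), i.e. SubWord(RotWord(w)) ^ Rcon, which produces the even-numbered
+; round keys (2, 4, ..., 14); key_expansion_256_flop broadcasts dword 2 (pshufd 0xAA),
+; i.e. SubWord(w) with no rotation or Rcon, which produces the odd-numbered round keys
+; (3, 5, ..., 13) of the AES-256 key schedule.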
+
+
+
+
+; macro to encrypt the tweak value in parallel with key generation of both keys
+
+%macro encrypt_T 10
+%define %%xkey2 %1
+%define %%xkey2_2 %2
+%define %%xstate_tweak %3
+%define %%xkey1 %4
+%define %%xkey1_2 %5
+%define %%xraw_key %6
+%define %%xtmp %7
+%define %%ptr_key2 %8
+%define %%ptr_key1 %9
+%define %%ptr_expanded_keys %10
+
+
+ movdqu %%xkey2, [%%ptr_key2]
+ pxor %%xstate_tweak, %%xkey2 ; ARK for tweak encryption
+
+ movdqu %%xkey1, [%%ptr_key1]
+ movdqa [%%ptr_expanded_keys+16*0], %%xkey1
+
+ movdqu %%xkey2_2, [%%ptr_key2 + 16*1]
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 1 for tweak encryption
+
+ movdqu %%xkey1_2, [%%ptr_key1 + 16*1]
+ movdqa [%%ptr_expanded_keys+16*1], %%xkey1_2
+
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x1 ; Generating round key 2 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x1 ; Generating round key 2 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 2 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*2], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x1 ; Generating round key 3 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x1 ; Generating round key 3 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 3 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*3], %%xkey1_2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x2 ; Generating round key 4 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x2 ; Generating round key 4 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 4 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*4], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x2 ; Generating round key 5 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x2 ; Generating round key 5 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 5 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*5], %%xkey1_2
+
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x4 ; Generating round key 6 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x4 ; Generating round key 6 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 6 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*6], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x4 ; Generating round key 7 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x4 ; Generating round key 7 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 7 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*7], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x8 ; Generating round key 8 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x8 ; Generating round key 8 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 8 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*8], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x8 ; Generating round key 9 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x8 ; Generating round key 9 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 9 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*9], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x10 ; Generating round key 10 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x10 ; Generating round key 10 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 10 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*10], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x10 ; Generating round key 11 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x10 ; Generating round key 11 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 11 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*11], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x20 ; Generating round key 12 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x20 ; Generating round key 12 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenc %%xstate_tweak, %%xkey2 ; round 12 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*12], %%xkey1
+
+ aeskeygenassist %%xraw_key, %%xkey2, 0x20 ; Generating round key 13 for key2
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey2_2
+ aeskeygenassist %%xraw_key, %%xkey1, 0x20 ; Generating round key 13 for key1
+ key_expansion_256_flop %%xraw_key, %%xtmp, %%xkey1_2
+ aesenc %%xstate_tweak, %%xkey2_2 ; round 13 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*13], %%xkey1_2
+
+
+ aeskeygenassist %%xraw_key, %%xkey2_2, 0x40 ; Generating round key 14 for key2
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey2
+ aeskeygenassist %%xraw_key, %%xkey1_2, 0x40 ; Generating round key 14 for key1
+ key_expansion_256_flip %%xraw_key, %%xtmp, %%xkey1
+ aesenclast %%xstate_tweak, %%xkey2 ; round 14 for tweak encryption
+ movdqa [%%ptr_expanded_keys+16*14], %%xkey1
+
+ movdqa [TW], %%xstate_tweak ; Store the encrypted Tweak value
+%endmacro
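+
+; On exit from encrypt_T: the 15 round keys expanded from key1 sit on the stack at
+; [keys + 16*0] .. [keys + 16*14], and the initial tweak encrypted under key2 sits in
+; [TW]; key2's schedule is kept only in registers since it is not needed again.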
+
+
+; generate initial tweak values
+; load initial plaintext values
+%macro initialize 16
+
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+
+%define %%num_initial_blocks %16
+
+
+ ; generate next Tweak values
+ movdqa %%TW1, [TW+16*0]
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqu %%ST1, [ptr_plaintext+16*0]
+%if (%%num_initial_blocks>=2)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph;
+ movdqa %%TW2, [TW+16*1]
+ movdqu %%ST2, [ptr_plaintext+16*1]
+%endif
+%if (%%num_initial_blocks>=3)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph;
+ movdqa %%TW3, [TW+16*2]
+ movdqu %%ST3, [ptr_plaintext+16*2]
+%endif
+%if (%%num_initial_blocks>=4)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph;
+ movdqa %%TW4, [TW+16*3]
+ movdqu %%ST4, [ptr_plaintext+16*3]
+%endif
+%if (%%num_initial_blocks>=5)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph;
+ movdqa %%TW5, [TW+16*4]
+ movdqu %%ST5, [ptr_plaintext+16*4]
+%endif
+%if (%%num_initial_blocks>=6)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph;
+ movdqa %%TW6, [TW+16*5]
+ movdqu %%ST6, [ptr_plaintext+16*5]
+%endif
+%if (%%num_initial_blocks>=7)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph;
+ movdqa %%TW7, [TW+16*6]
+ movdqu %%ST7, [ptr_plaintext+16*6]
+%endif
+
+
+
+%endmacro
+
+
+; encrypt initial blocks of AES
+; 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
+; next 8 Tweak values are generated
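+; the tweak-update arithmetic on twtempl:twtemph is interleaved ("stitched") between the
+; AES rounds so it executes in the latency shadow of the aesenc chain; when %%lt128 is 1
+; (the tail of the buffer) the interleaved tweak generation is skipped entirely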
+%macro encrypt_initial 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%T0 %16 ; Temp register
+%define %%num_blocks %17
+; %%num_blocks blocks encrypted
+; %%num_blocks can be 1, 2, 3, 4, 5, 6, 7
+
+%define %%lt128 %18 ; less than 128 bytes
+
+ ; xor Tweak value
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%T0
+%endif
+
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ %endif
+
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl ; next Tweak1 generated
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ %endif
+
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*2], twtempl ; next Tweak2 generated
+ %endif
+
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ %if (0 == %%lt128)
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ %endif
+
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl ; next Tweak3 generated
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ %endif
+
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl ; next Tweak4 generated
+ mov [TW + 8*7], twtemph
+ %endif
+
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl ; next Tweak5 generated
+ mov [TW + 8*9], twtemph
+ %endif
+
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*10], twtempl ; next Tweak6 generated
+ mov [TW + 8*11], twtemph
+ %endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl ; next Tweak7 generated
+ mov [TW + 8*13], twtemph
+ %endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ %if (0 == %%lt128)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*14], twtempl ; next Tweak8 generated
+ mov [TW + 8*15], twtemph
+ %endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenc %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenc %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenc %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenc %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenc %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenc %%ST7, %%T0
+%endif
+
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesenclast %%ST1, %%T0
+%if (%%num_blocks>=2)
+ aesenclast %%ST2, %%T0
+%endif
+%if (%%num_blocks>=3)
+ aesenclast %%ST3, %%T0
+%endif
+%if (%%num_blocks>=4)
+ aesenclast %%ST4, %%T0
+%endif
+%if (%%num_blocks>=5)
+ aesenclast %%ST5, %%T0
+%endif
+%if (%%num_blocks>=6)
+ aesenclast %%ST6, %%T0
+%endif
+%if (%%num_blocks>=7)
+ aesenclast %%ST7, %%T0
+%endif
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+%if (%%num_blocks>=2)
+ pxor %%ST2, %%TW2
+%endif
+%if (%%num_blocks>=3)
+ pxor %%ST3, %%TW3
+%endif
+%if (%%num_blocks>=4)
+ pxor %%ST4, %%TW4
+%endif
+%if (%%num_blocks>=5)
+ pxor %%ST5, %%TW5
+%endif
+%if (%%num_blocks>=6)
+ pxor %%ST6, %%TW6
+%endif
+%if (%%num_blocks>=7)
+ pxor %%ST7, %%TW7
+%endif
+
+
+%if (0 == %%lt128)
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endif
+
+%endmacro
+
+
+; Encrypt 8 blocks in parallel
+; generate next 8 tweak values
+%macro encrypt_by_eight 18
+%define %%ST1 %1 ; state 1
+%define %%ST2 %2 ; state 2
+%define %%ST3 %3 ; state 3
+%define %%ST4 %4 ; state 4
+%define %%ST5 %5 ; state 5
+%define %%ST6 %6 ; state 6
+%define %%ST7 %7 ; state 7
+%define %%ST8 %8 ; state 8
+%define %%TW1 %9 ; tweak 1
+%define %%TW2 %10 ; tweak 2
+%define %%TW3 %11 ; tweak 3
+%define %%TW4 %12 ; tweak 4
+%define %%TW5 %13 ; tweak 5
+%define %%TW6 %14 ; tweak 6
+%define %%TW7 %15 ; tweak 7
+%define %%TW8 %16 ; tweak 8
+%define %%T0 %17 ; Temp register
+%define %%last_eight %18
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ ; ARK
+ movdqa %%T0, [keys]
+ pxor %%ST1, %%T0
+ pxor %%ST2, %%T0
+ pxor %%ST3, %%T0
+ pxor %%ST4, %%T0
+ pxor %%ST5, %%T0
+ pxor %%ST6, %%T0
+ pxor %%ST7, %%T0
+ pxor %%ST8, %%T0
+
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 1
+ movdqa %%T0, [keys + 16*1]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*0], twtempl
+ mov [TW + 8*1], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 2
+ movdqa %%T0, [keys + 16*2]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+
+%endif
+ ; round 3
+ movdqa %%T0, [keys + 16*3]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*2], twtempl
+ mov [TW + 8*3], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 4
+ movdqa %%T0, [keys + 16*4]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*4], twtempl
+%endif
+ ; round 5
+ movdqa %%T0, [keys + 16*5]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*5], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 6
+ movdqa %%T0, [keys + 16*6]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*6], twtempl
+ mov [TW + 8*7], twtemph
+%endif
+ ; round 7
+ movdqa %%T0, [keys + 16*7]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+%endif
+ ; round 8
+ movdqa %%T0, [keys + 16*8]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*8], twtempl
+ mov [TW + 8*9], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+%endif
+ ; round 9
+ movdqa %%T0, [keys + 16*9]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+%endif
+ ; round 10
+ movdqa %%T0, [keys + 16*10]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*10], twtempl
+ mov [TW + 8*11], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+%endif
+ ; round 11
+ movdqa %%T0, [keys + 16*11]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW + 8*12], twtempl
+%endif
+ ; round 12
+ movdqa %%T0, [keys + 16*12]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ mov [TW + 8*13], twtemph
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+%endif
+ ; round 13
+ movdqa %%T0, [keys + 16*13]
+ aesenc %%ST1, %%T0
+ aesenc %%ST2, %%T0
+ aesenc %%ST3, %%T0
+ aesenc %%ST4, %%T0
+ aesenc %%ST5, %%T0
+ aesenc %%ST6, %%T0
+ aesenc %%ST7, %%T0
+ aesenc %%ST8, %%T0
+%if (0 == %%last_eight)
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+; mov [TW + 8*14], twtempl
+; mov [TW + 8*15], twtemph
+%endif
+ ; round 14
+ movdqa %%T0, [keys + 16*14]
+ aesenclast %%ST1, %%T0
+ aesenclast %%ST2, %%T0
+ aesenclast %%ST3, %%T0
+ aesenclast %%ST4, %%T0
+ aesenclast %%ST5, %%T0
+ aesenclast %%ST6, %%T0
+ aesenclast %%ST7, %%T0
+ aesenclast %%ST8, %%T0
+
+ ; xor Tweak values
+ pxor %%ST1, %%TW1
+ pxor %%ST2, %%TW2
+ pxor %%ST3, %%TW3
+ pxor %%ST4, %%TW4
+ pxor %%ST5, %%TW5
+ pxor %%ST6, %%TW6
+ pxor %%ST7, %%TW7
+ pxor %%ST8, %%TW8
+
+ mov [TW + 8*14], twtempl
+ mov [TW + 8*15], twtemph
+ ; load next Tweak values
+ movdqa %%TW1, [TW + 16*0]
+ movdqa %%TW2, [TW + 16*1]
+ movdqa %%TW3, [TW + 16*2]
+ movdqa %%TW4, [TW + 16*3]
+ movdqa %%TW5, [TW + 16*4]
+ movdqa %%TW6, [TW + 16*5]
+ movdqa %%TW7, [TW + 16*6]
+
+%endmacro
+
+
+section .text
+
+global XTS_AES_256_enc_sse:function
+XTS_AES_256_enc_sse:
+
+ sub rsp, VARIABLE_OFFSET
+
+ mov [_gpr + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [_gpr + 8*1], rdi
+ mov [_gpr + 8*2], rsi
+
+ movdqa [_xmm + 16*0], xmm6
+ movdqa [_xmm + 16*1], xmm7
+ movdqa [_xmm + 16*2], xmm8
+ movdqa [_xmm + 16*3], xmm9
+ movdqa [_xmm + 16*4], xmm10
+ movdqa [_xmm + 16*5], xmm11
+ movdqa [_xmm + 16*6], xmm12
+ movdqa [_xmm + 16*7], xmm13
+ movdqa [_xmm + 16*8], xmm14
+ movdqa [_xmm + 16*9], xmm15
+%endif
+
+ mov ghash_poly_8b, GHASH_POLY ; load 0x87 to ghash_poly_8b
+
+
+ movdqu xmm1, [T_val] ; read initial Tweak value
+ pxor xmm4, xmm4 ; for key expansion
+ encrypt_T xmm0, xmm5, xmm1, xmm2, xmm6, xmm3, xmm4, ptr_key2, ptr_key1, keys
+
+
+%ifidn __OUTPUT_FORMAT__, win64
+ mov ptr_plaintext, [rsp + VARIABLE_OFFSET + 8*5] ; plaintext pointer
+ mov ptr_ciphertext, [rsp + VARIABLE_OFFSET + 8*6] ; ciphertext pointer
+%endif
+
+
+
+ mov target_ptr_val, N_val
+ and target_ptr_val, -16 ; target_ptr_val = target_ptr_val - (target_ptr_val mod 16)
+        sub     target_ptr_val, 128             ; adjust target_ptr_val because the last 8 blocks will not be stitched with Tweak calculations
+ jl _less_than_128_bytes
+
+ add target_ptr_val, ptr_ciphertext
+
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ jz _initial_num_blocks_is_0
+
+ cmp tmp1, (4 << 4)
+ je _initial_num_blocks_is_4
+
+
+
+ cmp tmp1, (6 << 4)
+ je _initial_num_blocks_is_6
+
+ cmp tmp1, (5 << 4)
+ je _initial_num_blocks_is_5
+
+
+
+ cmp tmp1, (3 << 4)
+ je _initial_num_blocks_is_3
+
+ cmp tmp1, (2 << 4)
+ je _initial_num_blocks_is_2
+
+ cmp tmp1, (1 << 4)
+ je _initial_num_blocks_is_1
+
+_initial_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ add ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ add ptr_ciphertext, 16*7
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ add ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ add ptr_ciphertext, 16*6
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ add ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ add ptr_ciphertext, 16*5
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ add ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ add ptr_ciphertext, 16*4
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+
+_initial_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ add ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ add ptr_ciphertext, 16*3
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+_initial_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ add ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ movdqu [ptr_ciphertext+16], xmm2
+ add ptr_ciphertext, 16*2
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+ add ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 0
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+ add ptr_ciphertext, 16
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+
+ jmp _main_loop
+
+_initial_num_blocks_is_0:
+ mov twtempl, [TW+8*0]
+ mov twtemph, [TW+8*1]
+ movdqa xmm9, [TW+16*0]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*2], twtempl
+ mov [TW+8*3], twtemph
+ movdqa xmm10, [TW+16*1]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*4], twtempl
+ mov [TW+8*5], twtemph
+ movdqa xmm11, [TW+16*2]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*6], twtempl
+ mov [TW+8*7], twtemph
+ movdqa xmm12, [TW+16*3]
+
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*8], twtempl
+ mov [TW+8*9], twtemph
+ movdqa xmm13, [TW+16*4]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*10], twtempl
+ mov [TW+8*11], twtemph
+ movdqa xmm14, [TW+16*5]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*12], twtempl
+ mov [TW+8*13], twtemph
+ movdqa xmm15, [TW+16*6]
+
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW+8*14], twtempl
+ mov [TW+8*15], twtemph
+ ;movdqa xmm16, [TW+16*7]
+
+ cmp ptr_ciphertext, target_ptr_val
+ je _last_eight
+_main_loop:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+
+ add ptr_plaintext, 128
+
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 0
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+ movdqu [ptr_ciphertext+16*7], xmm8
+ add ptr_ciphertext, 128
+
+ cmp ptr_ciphertext, target_ptr_val
+ jne _main_loop
+
+_last_eight:
+ ; load plaintext
+ movdqu xmm1, [ptr_plaintext+16*0]
+ movdqu xmm2, [ptr_plaintext+16*1]
+ movdqu xmm3, [ptr_plaintext+16*2]
+ movdqu xmm4, [ptr_plaintext+16*3]
+ movdqu xmm5, [ptr_plaintext+16*4]
+ movdqu xmm6, [ptr_plaintext+16*5]
+ movdqu xmm7, [ptr_plaintext+16*6]
+ movdqu xmm8, [ptr_plaintext+16*7]
+ encrypt_by_eight xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, [TW+16*7], xmm0, 1
+
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+ movdqu [ptr_ciphertext+16*6], xmm7
+
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+_steal_cipher:
+ ; start cipher stealing
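+        ; XTS ciphertext stealing for a final partial block of N_val bytes (1..15):
+        ; xmm8 holds the ciphertext of the last full block. Its leading N_val bytes become
+        ; the short final ciphertext block, while its trailing 16-N_val bytes are "stolen",
+        ; appended to the N_val remaining plaintext bytes, and that combined 16-byte block
+        ; is encrypted with the next tweak (generated just below) and stored at _done as
+        ; the last full ciphertext block.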
+
+ ; generate next Tweak value
+ xor ghash_poly_8b_temp, ghash_poly_8b_temp
+ shl twtempl, 1
+ adc twtemph, twtemph
+ cmovc ghash_poly_8b_temp, ghash_poly_8b
+ xor twtempl, ghash_poly_8b_temp
+ mov [TW], twtempl
+ mov [TW + 8], twtemph
+
+ movdqa xmm2, xmm8
+
+ ; shift xmm8 to the left by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table]
+ movdqu xmm0, [twtempl+N_val]
+ pshufb xmm8, xmm0
+
+
+ movdqu xmm3, [ptr_plaintext + 112 + N_val] ; state register is temporarily xmm3 to eliminate a move
+ movdqu [ptr_ciphertext + 112 + N_val], xmm8
+
+ ; shift xmm3 to the right by 16-N_val bytes
+ lea twtempl, [pshufb_shf_table +16]
+ sub twtempl, N_val
+ movdqu xmm0, [twtempl]
+ pxor xmm0, [mask1]
+ pshufb xmm3, xmm0
+
+ pblendvb xmm3, xmm2 ;xmm0 is implicit
+
+ ; xor Tweak value
+ movdqa xmm8, [TW]
+ pxor xmm8, xmm3 ; state register is xmm8, instead of a move from xmm3 to xmm8, destination register of pxor instruction is swapped
+
+
+ ;encrypt last block with cipher stealing
+ pxor xmm8, [keys] ; ARK
+ aesenc xmm8, [keys + 16*1] ; round 1
+ aesenc xmm8, [keys + 16*2] ; round 2
+ aesenc xmm8, [keys + 16*3] ; round 3
+ aesenc xmm8, [keys + 16*4] ; round 4
+ aesenc xmm8, [keys + 16*5] ; round 5
+ aesenc xmm8, [keys + 16*6] ; round 6
+ aesenc xmm8, [keys + 16*7] ; round 7
+ aesenc xmm8, [keys + 16*8] ; round 8
+ aesenc xmm8, [keys + 16*9] ; round 9
+        aesenc          xmm8, [keys + 16*10]            ; round 10
+        aesenc          xmm8, [keys + 16*11]            ; round 11
+        aesenc          xmm8, [keys + 16*12]            ; round 12
+        aesenc          xmm8, [keys + 16*13]            ; round 13
+        aesenclast      xmm8, [keys + 16*14]            ; round 14
+
+ ; xor Tweak value
+ pxor xmm8, [TW]
+
+_done:
+ ; store last ciphertext value
+ movdqu [ptr_ciphertext+16*7], xmm8
+
+_ret_:
+
+ mov rbx, [_gpr + 8*0]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rdi, [_gpr + 8*1]
+ mov rsi, [_gpr + 8*2]
+
+
+ movdqa xmm6, [_xmm + 16*0]
+ movdqa xmm7, [_xmm + 16*1]
+ movdqa xmm8, [_xmm + 16*2]
+ movdqa xmm9, [_xmm + 16*3]
+ movdqa xmm10, [_xmm + 16*4]
+ movdqa xmm11, [_xmm + 16*5]
+ movdqa xmm12, [_xmm + 16*6]
+ movdqa xmm13, [_xmm + 16*7]
+ movdqa xmm14, [_xmm + 16*8]
+ movdqa xmm15, [_xmm + 16*9]
+%endif
+
+ add rsp, VARIABLE_OFFSET
+
+ ret
+
+
+
+
+
+_less_than_128_bytes:
+ cmp N_val, 16
+ jb _ret_
+
+ mov tmp1, N_val
+ and tmp1, (7 << 4)
+ cmp tmp1, (6 << 4)
+ je _num_blocks_is_6
+ cmp tmp1, (5 << 4)
+ je _num_blocks_is_5
+ cmp tmp1, (4 << 4)
+ je _num_blocks_is_4
+ cmp tmp1, (3 << 4)
+ je _num_blocks_is_3
+ cmp tmp1, (2 << 4)
+ je _num_blocks_is_2
+ cmp tmp1, (1 << 4)
+ je _num_blocks_is_1
+
+_num_blocks_is_7:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 7
+ sub ptr_plaintext, 16*1
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 7, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+ movdqu [ptr_ciphertext+16*5], xmm6
+
+ sub ptr_ciphertext, 16*1
+ movdqa xmm8, xmm7
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_6:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 6
+ sub ptr_plaintext, 16*2
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 6, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+ movdqu [ptr_ciphertext+16*4], xmm5
+
+ sub ptr_ciphertext, 16*2
+ movdqa xmm8, xmm6
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_5:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 5
+ sub ptr_plaintext, 16*3
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 5, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+ movdqu [ptr_ciphertext+16*3], xmm4
+
+ sub ptr_ciphertext, 16*3
+ movdqa xmm8, xmm5
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_4:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 4
+ sub ptr_plaintext, 16*4
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 4, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+ movdqu [ptr_ciphertext+16*2], xmm3
+
+ sub ptr_ciphertext, 16*4
+ movdqa xmm8, xmm4
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+_num_blocks_is_3:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 3
+ sub ptr_plaintext, 16*5
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 3, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext+16*0], xmm1
+ movdqu [ptr_ciphertext+16*1], xmm2
+
+ sub ptr_ciphertext, 16*5
+ movdqa xmm8, xmm3
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+_num_blocks_is_2:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 2
+ sub ptr_plaintext, 16*6
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 2, 1
+ ; store ciphertext
+ movdqu [ptr_ciphertext], xmm1
+
+ sub ptr_ciphertext, 16*6
+ movdqa xmm8, xmm2
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+
+_num_blocks_is_1:
+ initialize xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 1
+
+ sub ptr_plaintext, 16*7
+ encrypt_initial xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm0, 1, 1
+ ; store ciphertext
+
+ sub ptr_ciphertext, 16*7
+ movdqa xmm8, xmm1
+
+ and N_val, 15 ; N_val = N_val mod 16
+ je _done
+ jmp _steal_cipher
+
+section .data
+align 16
+
+pshufb_shf_table:
+; use these values for shift constants for the pshufb instruction
+; different alignments result in values as shown:
+; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
+; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
+; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
+; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
+; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
+; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
+; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7
+; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16-8) / shr8
+; dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7 (16-9) / shr9
+; dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6 (16-10) / shr10
+; dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5 (16-11) / shr11
+; dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4 (16-12) / shr12
+; dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3 (16-13) / shr13
+; dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2 (16-14) / shr14
+; dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1 (16-15) / shr15
+dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
+dq 0x0706050403020100, 0x000e0d0c0b0a0908
+
+mask1:
+dq 0x8080808080808080, 0x8080808080808080
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm
new file mode 100644
index 000000000..2fbc3b2af
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_common.asm
@@ -0,0 +1,427 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; the following defines control the operation of the macros below and
+; need to be defined in the including file
+; KEY_ROUNDS - number of key rounds needed based on key length: 128bit - 11, 192bit - 13 or 256bit - 15
+; EARLY_BLOCKS - number of data blocks to load before starting computations
+; PARALLEL_BLOCKS - number of blocks of data to process in parallel; also the number of xmm regs to reserve for data
+; IV_CNT - number of xmm regs to use for IV data; valid values are 0 or 1
+; TMP_CNT - number of temp xmm registers to reserve
+; XMM_USAGE - number of xmm registers to use; must be at least PARALLEL_BLOCKS + 2
+;
+
+%include "reg_sizes.asm"
+
+;
+; the following instruction-set-specific macros must be defined in the user file
+; to make use of the AES macros below
+; MOVDQ - move from memory to xmm reg
+; PXOR - XOR of two xmm registers pxor
+; AES_DEC - AES block decode for early key rounds
+; AES_DEC_LAST - AES block decode for last key round
+; or
+; AES_ENC - AES block encode for early key rounds
+; AES_ENC_LAST - AES block encode for last key round
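+;
+; for example, an including encryption file would typically supply something like
+; (illustrative only; the real definitions live in the per-key-size user files):
+;   %define MOVDQ        movdqu
+;   %define PXOR         pxor
+;   %define AES_ENC      aesenc
+;   %define AES_ENC_LAST aesenclast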
+
+; Three usages of xmm regs: key round cache, block data and temps
+; CKEY_CNT is (number of xmm regs) - PARALLEL_BLOCKS - IV holder - 2 TMP xmm regs
+%assign FIRST_XDATA (0)
+%assign IV_IDX (FIRST_XDATA + PARALLEL_BLOCKS)
+%ifndef IV_CNT
+%define IV_CNT (1)
+%endif
+%assign TMP (IV_IDX + IV_CNT)
+%assign TMP_CNT (2)
+%assign FIRST_CKEY (TMP + TMP_CNT)
+%assign CKEY_CNT (XMM_USAGE - (PARALLEL_BLOCKS + IV_CNT + TMP_CNT))
+
+; Abstract xmm register usages that identify the expected contents of the register
+%define reg(i) xmm %+ i
+%define XDATA(i) xmm %+ i
+%define KEY_REG(i) xmm %+ i
+%define IV_REG(i) xmm %+ i
+
+%define IDX rax
+
+
+
+
+;
+;
+; AES CBC ENCODE MACROS
+;
+;
+
+;
+; CBC_DECRYPT_BLOCKS
+; Decrypts a number of blocks using the AES_PARALLEL_ENC_BLOCKS macro
+; Finalizes the decryption and saves the results in the output
+; places the last buffer's cipher text in IV for the next buffer
+; updates the index and number of bytes left
+;
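+; i.e. per block: P[i] = AES-decrypt(C[i]) XOR C[i-1], with C[-1] = IV
+;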
+%macro CBC_DECRYPT_BLOCKS 17
+%define %%TOT_ROUNDS %1
+%define %%num_blocks %2 ; can be 0..13
+%define %%EARLY_LOADS %3 ; number of data blocks to load before processing
+%define %%MOVDQ %4
+%define %%PXOR %5
+%define %%AES_DEC %6
+%define %%AES_DEC_LAST %7
+%define %%CACHED_KEYS %8 ; number of key data cached in xmm regs
+%define %%TMP %9
+%define %%TMP_CNT %10
+%define %%FIRST_CKEY %11
+%define %%KEY_DATA %12
+%define %%FIRST_XDATA %13
+%define %%IN %14 ; input data
+%define %%OUT %15 ; output data
+%define %%IDX %16 ; index into input and output data buffers
+%define %%LEN %17
+
+ AES_PARALLEL_ENC_BLOCKS %%TOT_ROUNDS, %%num_blocks, %%EARLY_LOADS, %%MOVDQ, %%PXOR, %%AES_DEC, %%AES_DEC_LAST, %%CACHED_KEYS, %%TMP, %%TMP_CNT, %%FIRST_CKEY, %%KEY_DATA, %%FIRST_XDATA, %%IN, %%OUT, %%IDX
+
+ ;
+        ; XOR the result of each block's decrypt with the previous block's cipher text (C)
+ ;
+ %assign i 0
+ %rep (%%num_blocks)
+ %%PXOR XDATA(i), XDATA(IV_IDX) ; XOR result with previous block's C
+ %%MOVDQ [%%OUT + %%IDX + i*16], XDATA(i) ; save plain text to out
+ %%MOVDQ XDATA(IV_IDX), [%%IN + IDX + i*16] ; load IV with current block C
+ %assign i (i+1)
+ %endrep
+
+ add %%IDX, %%num_blocks*16
+ sub %%LEN, %%num_blocks*16
+%endmacro
+
+
+;
+; CBC_ENC_INIT
+; XOR first data block with the IV data
+%macro CBC_ENC_INIT 7
+%define %%P_FIRST %1
+%define %%IV_IDX %2
+%define %%MOVDQ %3
+%define %%PXOR %4
+%define %%IV %5
+%define %%IN %6 ; input data
+%define %%IDX %7 ; index into input and output data buffers
+
+ %%MOVDQ XDATA(%%P_FIRST), [%%IN + %%IDX + 0*16]
+ %%MOVDQ reg(%%IV_IDX), [%%IV]
+ %%PXOR XDATA(%%P_FIRST), reg(%%IV_IDX)
+%endmacro
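+
+; (the encrypt loop below then computes C[0] = AES-encrypt(P[0] XOR IV) and
+;  C[i] = AES-encrypt(P[i] XOR C[i-1]) for the remaining blocks)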
+
+;
+; assumptions:
+; LEN is length of data remaining
+; IDX is offset into the data buffer
+;
+; subloops
+; if data > 16 load next block into a next XDATA reg (XDATA(p_next))
+; load first uncached key into TMP0 (if any)
+; AES block encrypt XDATA(P_FIRST)
+; if data > 16 XOR the next block (XDATA(p_next)) with current (XDATA(P_FIRST))
+; save current (XDATA(P_FIRST))
+; update indexes for P_FIRST
+; end if data zero
+;
+%macro CBC_ENC_SUBLOOP 17
+%define %%TOT_ROUNDS %1
+%define %%BLOCKS %2 ; can be 1...14
+%define %%START_DATA %3
+%define %%MOVDQ %4
+%define %%PXOR %5
+%define %%AES_DEC %6
+%define %%AES_DEC_LAST %7
+%define %%TMP %8
+%define %%TMP_CNT %9
+%define %%FIRST_CKEY %10
+%define %%CKEY_CNT %11
+%define %%KEYS %12
+%define %%CACHED_KEYS %13
+%define %%IN %14 ; input data
+%define %%OUT %15 ; output data
+%define %%IDX %16 ; index into input and output data buffers
+%define %%LEN %17
+
+ %assign this_blk 0
+ %assign next_blk 1
+ %assign p_first %%START_DATA
+ %assign p_next (p_first+1)
+ ; for number of blocks to be processed in a loop
+ %assign blk 1
+ %rep %%BLOCKS
+ ; if data > 16 load next block into a next XDATA reg (XDATA(p_next))
+ cmp %%LEN, 16
+ %push skip_read
+ je %$skip_read_next
+ %%MOVDQ XDATA(p_next), [%%IN + %%IDX + next_blk*16]
+ %$skip_read_next:
+ %pop
+
+ AES_ENC_BLOCKS %%TOT_ROUNDS, p_first, %%TMP, %%TMP_CNT, %%FIRST_CKEY, %%CKEY_CNT, %%KEYS, %%MOVDQ, %%PXOR, %%AES_DEC, %%AES_DEC_LAST
+
+        ; if data > 16 XOR the next block (XDATA(p_next)) with current (XDATA(p_first))
+ cmp %%LEN, 16
+ %push skip_next
+ je %$skip_next_blk_start
+ %%PXOR XDATA(p_next), XDATA(p_first)
+ %$skip_next_blk_start:
+ %pop
+
+ ; save current (XDATA(p_first))
+ %%MOVDQ [%%OUT + %%IDX + this_blk*16], XDATA(p_first)
+ ; update indexes for p_first
+ add %%IDX, 16
+ sub %%LEN, 16
+
+ %if (blk < %%BLOCKS) ; only insert jz if NOT last block
+ ; end if data zero
+ jz %%END_CBC_ENC_SUBLOOP
+ %endif ; (p_next < %%BLOCKS)
+
+ %assign p_first (p_next)
+ %assign blk (blk+1)
+ %if (blk == %%BLOCKS) ; the last rep loop's read of the next block needs to be into START_DATA
+ %assign p_next (%%START_DATA)
+ %elif (1 == %%BLOCKS)
+ %%MOVDQ XDATA(%%START_DATA), XDATA(p_next)
+ %else
+ %assign p_next (p_next+1)
+ %endif
+ %endrep ; %%BLOCKS
+
+ %%END_CBC_ENC_SUBLOOP:
+%endm ; CBC_ENC_SUBLOOP
+
+
+;
+;
+; AES BLOCK ENCODE MACROS
+;
+;
+
+;
+; FILL_KEY_CACHE
+; Load key data into the cache key xmm regs
+%macro FILL_KEY_CACHE 4
+%define %%CACHED_KEYS %1
+%define %%CKEY_START %2
+%define %%KEY_DATA %3
+%define %%MOVDQ %4
+
+ %assign rnd 0
+ %rep KEY_ROUNDS
+ %if (rnd < %%CACHED_KEYS) ; find the round's key data
+ %assign c (rnd + %%CKEY_START)
+ %%MOVDQ KEY_REG(c), [%%KEY_DATA + rnd*16] ;load sub key into an available register
+ %endif
+ %assign rnd (rnd+1)
+ %endrep
+%endmacro
+
+;
+; SCHEDULE_DATA_LOAD
+; pre-loads message data into xmm regs
+; updates global 'blocks_loaded' that tracks which data blocks have been loaded
+; 'blocks_loaded' is an in/out global and must be declared in the using macro or function
+%macro SCHEDULE_DATA_LOAD 5
+%define %%PARALLEL_DATA %1
+%define %%EARLY_LOADS %2
+%define %%MOVDQ %3
+%define %%IN %4
+%define %%IDX %5
+
+ %if (blocks_loaded < %%PARALLEL_DATA)
+ ; load cipher text
+ %%MOVDQ XDATA(blocks_loaded), [%%IN + %%IDX + blocks_loaded*16]
+ %assign blocks_loaded (blocks_loaded+1)
+ %endif ; (blocks_loaded < %%PARALLEL_DATA)
+%endmacro ; SCHEDULE_DATA_LOAD
+
+;
+; INIT_SELECT_KEY
+; determine which xmm reg holds the key data needed, or loads it into the temp register if not cached
+; 'current_tmp' is an in/out global and must be declared in the using macro or function
+%macro INIT_SELECT_KEY 6
+%define %%TOT_ROUNDS %1
+%define %%CACHED_KEYS %2
+%define %%KEY_DATA %3
+%define %%FIRST_TMP %4
+%define %%TMP_CNT %5
+%define %%MOVDQ %6
+
+ %assign current_tmp (%%FIRST_TMP)
+ %if (%%TOT_ROUNDS > %%CACHED_KEYS) ; load the first uncached key into temp reg
+ %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + %%CACHED_KEYS*16]
+ %endif ; (KEY_ROUNDS > CKEY_CNT)
+%endmacro ; INIT_SELECT_KEY
+
+;
+; SELECT_KEY
+; determine which xmm reg holds the key data needed, or loads it into the temp register if not cached
+; 'current_tmp' is an in/out global and must be declared in the using macro or function
+%macro SELECT_KEY 8
+%define %%ROUND %1
+%define %%TOT_ROUNDS %2
+%define %%CACHED_KEYS %3
+%define %%FIRST_KEY %4
+%define %%KEY_DATA %5
+%define %%FIRST_TMP %6
+%define %%TMP_CNT %7
+%define %%MOVDQ %8
+
+ ; find the key data for this round
+ %if (%%ROUND < %%CACHED_KEYS) ; is it cached
+ %assign key (%%ROUND + %%FIRST_KEY)
+ %else
+ ; Load non-cached key %%ROUND data ping-ponging between temp regs if more than one
+ %assign key (current_tmp) ; use the previous loaded key data
+ %if (1 == %%TMP_CNT)
+ %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + %%ROUND*16] ; load the next rounds key data
+ %else
+ %assign next_round (%%ROUND+1)
+ %if (next_round < %%TOT_ROUNDS) ; if more rounds to be done
+ %if (current_tmp == %%FIRST_TMP) ; calc the next temp reg to use
+ %assign current_tmp (current_tmp + 1)
+ %else
+ %assign current_tmp (%%FIRST_TMP)
+ %endif ; (current_tmp == %%FIRST_TMP)
+ %%MOVDQ KEY_REG(current_tmp), [%%KEY_DATA + next_round*16] ; load the next rounds key data
+
+ %endif ; (%%ROUND < KEY_ROUNDS)
+ %endif ; (1 < %%TMP_CNT)
+ %endif ; (%%ROUND < %%CACHED_KEYS)
+%endmacro ; SELECT_KEY
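+
+; The selection above amounts to the following simplified C-level sketch:
+; round keys that fit in xmm registers stay cached, and the remaining keys are
+; streamed through one or two temporary registers, alternating between the two
+; so the next key load can overlap use of the current one (the exact look-ahead
+; scheme above differs slightly; names here are for illustration only):
+;
+;	const uint8_t *select_key(const uint8_t *key_data, int round,
+;				  int cached_keys, uint8_t tmp[2][16], int *cur)
+;	{
+;		if (round < cached_keys)
+;			return key_data + round * 16;   /* already held in a register */
+;		*cur ^= 1;                              /* ping-pong between temps    */
+;		memcpy(tmp[*cur], key_data + round * 16, 16);
+;		return tmp[*cur];
+;	}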
+
+
+;
+; AES_PARALLEL_ENC_BLOCKS
+; preloads some data blocks to be worked on
+; starts the aes block encoding while loading the other blocks to be done in parallel
+; aes block encodes each key round on each block
+%macro AES_PARALLEL_ENC_BLOCKS 16
+%define %%KEY_ROUNDS %1
+%define %%PARALLEL_DATA %2
+%define %%EARLY_LOADS %3
+%define %%MOVDQ %4
+%define %%PXOR %5
+%define %%AES_DEC %6
+%define %%AES_DEC_LAST %7
+%define %%CACHED_KEYS %8
+%define %%TMP %9
+%define %%TMP_CNT %10
+%define %%FIRST_CKEY %11
+%define %%KEY_DATA %12
+%define %%FIRST_XDATA %13
+%define %%IN %14 ; input data
+%define %%OUT %15 ; output data
+%define %%IDX %16 ; index into input and output data buffers
+
+ %assign blocks_loaded 0
+
+ %rep %%EARLY_LOADS
+ SCHEDULE_DATA_LOAD %%PARALLEL_DATA, %%EARLY_LOADS, %%MOVDQ, %%IN, %%IDX ; updates blocks_loaded
+ %endrep ; %%EARLY_LOADS
+
+ %assign current_tmp (TMP)
+ INIT_SELECT_KEY %%KEY_ROUNDS, %%CACHED_KEYS, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
+
+ %assign round 0
+ %assign key 0
+ %rep KEY_ROUNDS ; for all key rounds
+ SELECT_KEY round, %%KEY_ROUNDS, %%CACHED_KEYS, %%FIRST_CKEY, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
+
+ %assign i %%FIRST_XDATA
+		%rep %%PARALLEL_DATA	; for each block do the AES block encode step
+ %if (0 == round)
+ %%PXOR XDATA(i), KEY_REG(key) ; first round's step
+ SCHEDULE_DATA_LOAD %%PARALLEL_DATA, %%EARLY_LOADS, %%MOVDQ, %%IN, %%IDX
+
+ %elif ( (%%KEY_ROUNDS-1) == round )
+ %%AES_DEC_LAST XDATA(i), KEY_REG(key) ; last round's step
+
+ %else
+ %%AES_DEC XDATA(i), KEY_REG(key) ; middle round's (1..last-1) step
+
+ %endif
+ %assign i (i+1)
+ %endrep ;%%PARALLEL_DATA
+ %assign round (round+1)
+ %endrep ;KEY_ROUNDS
+%endmacro ; AES_PARALLEL_ENC_BLOCKS
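+
+; Per-block round structure applied above, in C-like form (aes_round and
+; aes_last_round stand in for aesdec/aesdeclast, or aesenc/aesenclast in the
+; encrypt variants; blk[] and rk[] are 16-byte values; illustration only):
+;
+;	for (int r = 0; r < key_rounds; r++)
+;		for (int b = 0; b < parallel_blocks; b++) {
+;			if (r == 0)
+;				xor_block(blk[b], rk[0]);       /* initial key add */
+;			else if (r == key_rounds - 1)
+;				aes_last_round(blk[b], rk[r]);  /* aesdeclast      */
+;			else
+;				aes_round(blk[b], rk[r]);       /* middle rounds   */
+;		}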
+
+
+
+;
+; AES_ENC_BLOCKS
+; load first uncached key into TMP0 (if any)
+;	AES block encrypt XDATA(p_first)
+; before using uncached key in TMP0, load next key in TMP1
+; before using uncached key in TMP1, load next key in TMP0
+%macro AES_ENC_BLOCKS 11
+%define %%TOT_ROUNDS %1
+%define %%ENC_BLOCK %2
+%define %%TMP %3
+%define %%TMP_CNT %4
+%define %%FIRST_CKEY %5
+%define %%CACHED_KEYS %6
+%define %%KEY_DATA %7
+%define %%MOVDQ %8
+%define %%PXOR %9
+%define %%AES_ENC %10
+%define %%AES_ENC_LAST %11
+
+ %assign current_tmp (%%TMP)
+ INIT_SELECT_KEY %%TOT_ROUNDS, %%CACHED_KEYS, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
+
+ %assign round 0
+ %assign key (round + %%FIRST_CKEY)
+ %rep %%TOT_ROUNDS ; for all key rounds
+ ; find the key data for this round
+ SELECT_KEY round, %%TOT_ROUNDS, %%CACHED_KEYS, %%FIRST_CKEY, %%KEY_DATA, %%TMP, %%TMP_CNT, %%MOVDQ
+
+ ; encrypt block
+ %if (0 == round)
+ %%PXOR XDATA(%%ENC_BLOCK), KEY_REG(key) ; round zero step
+ %elif ( (%%TOT_ROUNDS-1) == round )
+ %%AES_ENC_LAST XDATA(%%ENC_BLOCK), KEY_REG(key) ; last round's step
+ %else
+ %%AES_ENC XDATA(%%ENC_BLOCK), KEY_REG(key) ; rounds 1..last-1 step
+ %endif ; (0 == round)
+
+ %assign round (round+1)
+ %endrep ; KEY_ROUNDS
+%endmacro ; AES_ENC
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm
new file mode 100644
index 000000000..3b8a136e0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x4_sse.asm
@@ -0,0 +1,161 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES CBC decrypt on 16n bytes, doing AES on 4 blocks at a time
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_128_sse(void *in,
+; uint8_t *IV,
+;                          uint8_t *keys,
+; void *out,
+; uint64_t len_bytes);
+;
+; arg 1: IN: pointer to input (cipher text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (plain text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;
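+; A minimal C-level usage sketch (expand_dec_keys_128 is a hypothetical
+; placeholder for whatever key-expansion step produces the 11 16-byte decrypt
+; round keys; buffer length must be a multiple of 16):
+;
+;	uint8_t iv[16], dec_keys[11 * 16];
+;	uint8_t cipher[4096], plain[4096];
+;	expand_dec_keys_128(user_key, dec_keys);            /* assumed helper */
+;	aes_cbc_dec_128_sse(cipher, iv, dec_keys, plain, sizeof(cipher));
+;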
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+
+%endif
+
+; configuration parameters for AES-CBC macros
+%define KEY_ROUNDS 11	; AES-128: 10 rounds + initial key add = 11 round keys
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (2)
+%define PARALLEL_BLOCKS (8)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_DEC aesdec
+%define AES_DEC_LAST aesdeclast
+%include "cbc_common.asm"
+
+section .text
+
+align 16
+global aes_cbc_dec_128_sse:function
+func(aes_cbc_dec_128_sse)
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left, do in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
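+
+; The driver above decomposes the byte count as in this C-level sketch of the
+; dispatch (decrypt_blocks stands in for CBC_DECRYPT_BLOCKS; illustration only):
+;
+;	while (len >= PARALLEL_BLOCKS * 16) decrypt_blocks(PARALLEL_BLOCKS);
+;	while (len >= 4 * 16)               decrypt_blocks(4);
+;	while (len >= 2 * 16)               decrypt_blocks(2);
+;	if    (len)                         decrypt_blocks(1);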
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm
new file mode 100644
index 000000000..a41d3900d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_128_x8_avx.asm
@@ -0,0 +1,161 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; routine to do AES128 CBC decrypt
+;; clobbers xmm0-15
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+
+%endif
+
+; configuration parameters for AES-CBC
+%define KEY_ROUNDS 11
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_DEC 2
+ vaesdec %1, %1, %2
+%endm
+
+%macro AES_DEC_LAST 2
+ vaesdeclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+section .text
+
+;; aes_cbc_dec_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+global aes_cbc_dec_128_avx:function
+func(aes_cbc_dec_128_avx)
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left, do in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm
new file mode 100644
index 000000000..eedff8709
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x4_sse.asm
@@ -0,0 +1,163 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES CBC decrypt on 16n bytes
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_192_sse(void *in,
+; uint8_t *IV,
+;                       uint8_t *keys,   // 13 round keys (one more than the number of rounds)
+; void *out,
+; uint64_t len_bytes);
+;
+; arg 1: IN: pointer to input (cipher text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (plain text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;
+
+%include "reg_sizes.asm"
+
+%define MOVDQ movdqu
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+
+%endif
+
+; configuration parameters for AES-CBC
+%define KEY_ROUNDS 13	; AES-192: 12 rounds + initial key add = 13 round keys
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (2)
+%define PARALLEL_BLOCKS (5)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_DEC aesdec
+%define AES_DEC_LAST aesdeclast
+
+%include "cbc_common.asm"
+
+section .text
+
+global aes_cbc_dec_192_sse:function
+func(aes_cbc_dec_192_sse)
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left, do in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm
new file mode 100644
index 000000000..3de1cbcaf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_192_x8_avx.asm
@@ -0,0 +1,157 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES192 CBC decrypt
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+; configuration parameters for AES-CBC
+%define KEY_ROUNDS 13
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_DEC 2
+ vaesdec %1, %1, %2
+%endm
+
+%macro AES_DEC_LAST 2
+ vaesdeclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_192_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+global aes_cbc_dec_192_avx:function
+func(aes_cbc_dec_192_avx)
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left, do in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm
new file mode 100644
index 000000000..b6c081ffb
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x4_sse.asm
@@ -0,0 +1,160 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES CBC decrypt on 16n bytes
+; XMM registers are clobbered. Saving/restoring must be done at a higher level
+
+; void aes_cbc_dec_256_sse(void *in,
+; uint8_t *IV,
+;                          uint8_t *keys,
+;                          void *out,
+;                          uint64_t len_bytes);
+;
+; arg 1: IN: pointer to input (cipher text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (plain text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;
+
+%include "reg_sizes.asm"
+
+%define MOVDQ movdqu
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+; configuration parameters for AES-CBC
+%define KEY_ROUNDS 15	; AES-256: 14 rounds + initial key add = 15 round keys
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_DEC aesdec
+%define AES_DEC_LAST aesdeclast
+
+%include "cbc_common.asm"
+
+global aes_cbc_dec_256_sse:function
+func(aes_cbc_dec_256_sse)
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left, do in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm
new file mode 100644
index 000000000..52efa3f50
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_dec_256_x8_avx.asm
@@ -0,0 +1,157 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; routine to do AES256 CBC decrypt
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN rcx
+%define IV rdx
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+; configuration parameters for AES-CBC
+%define KEY_ROUNDS 15
+%define XMM_USAGE (16)
+%define EARLY_BLOCKS (4)
+%define PARALLEL_BLOCKS (11)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ vmovdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_DEC 2
+ vaesdec %1, %1, %2
+%endm
+
+%macro AES_DEC_LAST 2
+ vaesdeclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; aes_cbc_dec_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes)
+global aes_cbc_dec_256_avx:function
+func(aes_cbc_dec_256_avx)
+ FUNC_SAVE
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+
+ MOVDQ reg(IV_IDX), [IV] ; Load IV for next round of block decrypt
+ mov IDX, 0
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; if enough data blocks remain enter main_loop
+ jmp partials
+
+main_loop:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, PARALLEL_BLOCKS, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ cmp LEN, PARALLEL_BLOCKS*16
+ jge main_loop ; enough blocks to do another full parallel set
+ jz done
+
+partials:	; fewer than 'PARALLEL_BLOCKS' left, do in groups of 4, 2 or 1
+ cmp LEN, 0
+ je done
+ cmp LEN, 4*16
+ jge initial_4
+ cmp LEN, 2*16
+ jge initial_2
+
+initial_1:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 1, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jmp done
+
+initial_2:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 2, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jz done
+ jmp partials
+
+initial_4:
+ CBC_DECRYPT_BLOCKS KEY_ROUNDS, 4, EARLY_BLOCKS, MOVDQ, PXOR, AES_DEC, AES_DEC_LAST, CKEY_CNT, TMP, TMP_CNT, FIRST_CKEY, KEYS, FIRST_XDATA, IN, OUT, IDX, LEN
+ jnz partials
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm
new file mode 100644
index 000000000..b3cdd8348
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x4_sb.asm
@@ -0,0 +1,136 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 128 bit CBC AES encrypt
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_128_x4(void *in,
+;; uint8_t *IV,
+;; uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+
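+; A minimal C-level usage sketch (expand_enc_keys_128 is a hypothetical
+; placeholder for the key-expansion step that produces the 11 16-byte encrypt
+; round keys; buffer length must be a multiple of 16):
+;
+;	uint8_t iv[16], enc_keys[11 * 16];
+;	uint8_t plain[4096], cipher[4096];
+;	expand_enc_keys_128(user_key, enc_keys);            /* assumed helper */
+;	aes_cbc_enc_128_x4(plain, iv, enc_keys, cipher, sizeof(plain));
+;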
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 11
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+%define EARLY_BLOCKS (2)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_ENC aesenc
+%define AES_ENC_LAST aesenclast
+
+%include "cbc_common.asm"
+
+
+global aes_cbc_enc_128_x4:function
+func(aes_cbc_enc_128_x4)
+ FUNC_SAVE
+
+ mov IDX, 0
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm
new file mode 100644
index 000000000..a6be6df52
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_128_x8_sb.asm
@@ -0,0 +1,150 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 128 bit CBC AES encrypt
+;; clobbers all registers except for ARG1 and rbp
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_128_x8(void *in,
+;;	uint8_t *IV,
+;;	uint8_t *keys,
+;;	void *out,
+;;	uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;; clobbers all registers except for ARG1 and rbp
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 11
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+%define IV_CNT (1)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_ENC 2
+ vaesenc %1, %1, %2
+%endm
+
+%macro AES_ENC_LAST 2
+ vaesenclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+global aes_cbc_enc_128_x8:function
+func(aes_cbc_enc_128_x8)
+ FUNC_SAVE
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm
new file mode 100644
index 000000000..cfaf83bad
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x4_sb.asm
@@ -0,0 +1,148 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 192 bit CBC AES encrypt
+;;; Updates In and Out pointers at end
+
+;include "mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+%define MOVDQ movdqu ;; assume buffers not aligned
+%macro pxor2 2
+ MOVDQ XTMP, %2
+ pxor %1, XTMP
+%endm
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_192_x4(void *in,
+;; uint8_t *IV,
+;;	uint8_t *keys,
+;;	void *out,
+;;	uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 13
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_ENC aesenc
+%define AES_ENC_LAST aesenclast
+
+%include "cbc_common.asm"
+
+
+global aes_cbc_enc_192_x4:function
+func(aes_cbc_enc_192_x4)
+ FUNC_SAVE
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm
new file mode 100644
index 000000000..ed72bbe9d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_192_x8_sb.asm
@@ -0,0 +1,146 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 192 bit CBC AES encrypt
+;; clobbers all registers except for ARG1 and rbp
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_192_x8(void *in,
+;; uint8_t *IV,
+;;	uint8_t *keys,
+;;	void *out,
+;;	uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;; clobbers all registers except for ARG1 and rbp
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 13
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_ENC 2
+ vaesenc %1, %1, %2
+%endm
+
+%macro AES_ENC_LAST 2
+ vaesenclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+global aes_cbc_enc_192_x8:function
+func(aes_cbc_enc_192_x8)
+ FUNC_SAVE
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
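For reference, callers are not expected to invoke aes_cbc_enc_192_x8 directly; they go through the dispatched aes_cbc_enc_192 entry point (set up in cbc_multibinary.asm later in this patch) after expanding the key with aes_cbc_precomp. A minimal C sketch of that call path, assuming the <aes_cbc.h> API added in this series; buffer names are illustrative and error handling is abbreviated:

    /* Sketch only: key24/iv16/pt/ct are illustrative names. len must be a
     * whole number of 16-byte blocks, and the key schedule is kept 16-byte
     * aligned, mirroring the tests in this patch. */
    #include <stdint.h>
    #include <stdlib.h>
    #include <aes_cbc.h>

    int cbc192_encrypt_sketch(uint8_t *key24, uint8_t *iv16,
                              uint8_t *pt, uint8_t *ct, uint64_t len)
    {
            struct cbc_key_data *keys = NULL;

            if (len % 16)
                    return 1;                           /* whole blocks only */
            if (posix_memalign((void **)&keys, 16, sizeof(*keys)))
                    return 1;
            aes_cbc_precomp(key24, CBC_192_BITS, keys); /* expand round keys once */
            aes_cbc_enc_192(pt, iv16, keys->enc_keys, ct, len);
            free(keys);
            return 0;
    }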
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm
new file mode 100644
index 000000000..dd0ea5627
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x4_sb.asm
@@ -0,0 +1,140 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 256 bit CBC AES encrypt
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_256_x4(void *in,
+;; uint8_t *IV,
+;; uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 15
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%define PXOR pxor
+%define AES_ENC aesenc
+%define AES_ENC_LAST aesenclast
+
+%include "cbc_common.asm"
+
+
+global aes_cbc_enc_256_x4:function
+func(aes_cbc_enc_256_x4)
+ FUNC_SAVE
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm
new file mode 100644
index 000000000..74ad399d4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_enc_256_x8_sb.asm
@@ -0,0 +1,147 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; routine to do a 256 bit CBC AES encrypt
+;; clobbers all registers except for ARG1 and rbp
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Updates In and Out pointers at end
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;void aes_cbc_enc_256_x8(void *in,
+;; uint8_t *IV,
+;; uint8_t *keys,
+;; void *out,
+;; uint64_t len_bytes);
+; arg 1: IN: pointer to input (plain text)
+; arg 2: IV: pointer to IV
+; arg 3: KEYS: pointer to keys
+; arg 4: OUT: pointer to output (cipher text)
+; arg 5: LEN: length in bytes (multiple of 16)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define IN0 rdi
+%define IN rdi
+%define IV rsi
+%define KEYS rdx
+%define OUT rcx
+%define LEN r8
+%define KEYS0 rdx
+%define OUT0 rcx
+%define func(x) x:
+%define FUNC_SAVE
+%define FUNC_RESTORE
+%endif
+
+%ifidn __OUTPUT_FORMAT__, win64
+%define IN0 rcx
+%define IN rcx
+%define IV rdx
+%define KEYS0 r8
+%define OUT0 r9
+%define KEYS r8
+%define OUT r9
+%define LEN r10
+%define PS 8
+%define stack_size 10*16 + 1*8 ; must be an odd multiple of 8
+%define arg(x) [rsp + stack_size + PS + PS*x]
+
+%define func(x) proc_frame x
+%macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ end_prolog
+ mov LEN, arg(4)
+%endmacro
+
+%macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ add rsp, stack_size
+%endmacro
+%endif
+
+%define KEY_ROUNDS 15
+%define XMM_USAGE (16)
+%define UNROLLED_LOOPS (3)
+%define PARALLEL_BLOCKS (UNROLLED_LOOPS)
+
+; instruction set specific operation definitions
+%define MOVDQ movdqu
+%macro PXOR 2
+ vpxor %1, %1, %2
+%endm
+
+%macro AES_ENC 2
+ vaesenc %1, %1, %2
+%endm
+
+%macro AES_ENC_LAST 2
+ vaesenclast %1, %1, %2
+%endm
+
+%include "cbc_common.asm"
+
+
+global aes_cbc_enc_256_x8:function
+func(aes_cbc_enc_256_x8)
+ FUNC_SAVE
+
+ mov IDX, 0
+
+ FILL_KEY_CACHE CKEY_CNT, FIRST_CKEY, KEYS, MOVDQ
+ CBC_ENC_INIT FIRST_XDATA, TMP, MOVDQ, PXOR, IV, IN, IDX
+
+main_loop:
+ CBC_ENC_SUBLOOP KEY_ROUNDS, UNROLLED_LOOPS, FIRST_XDATA, MOVDQ, PXOR, AES_ENC, AES_ENC_LAST, TMP, TMP_CNT, FIRST_CKEY, CKEY_CNT, KEYS, CACHED_KEYS, IN, OUT, IDX, LEN
+ jne main_loop
+
+done:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm
new file mode 100644
index 000000000..fc458ea4b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_multibinary.asm
@@ -0,0 +1,83 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+
+default rel
+[bits 64]
+
+extern aes_cbc_dec_128_sse
+extern aes_cbc_dec_128_avx
+extern aes_cbc_dec_192_sse
+extern aes_cbc_dec_192_avx
+extern aes_cbc_dec_256_sse
+extern aes_cbc_dec_256_avx
+
+
+extern aes_cbc_enc_128_x4
+extern aes_cbc_enc_128_x8
+extern aes_cbc_enc_192_x4
+extern aes_cbc_enc_192_x8
+extern aes_cbc_enc_256_x4
+extern aes_cbc_enc_256_x8
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate aesni_cbc interfaces enc and dec
+;;;;
+mbin_interface aes_cbc_dec_128
+mbin_dispatch_init aes_cbc_dec_128, aes_cbc_dec_128_sse, aes_cbc_dec_128_avx, aes_cbc_dec_128_avx
+mbin_interface aes_cbc_dec_192
+mbin_dispatch_init aes_cbc_dec_192, aes_cbc_dec_192_sse, aes_cbc_dec_192_avx, aes_cbc_dec_192_avx
+mbin_interface aes_cbc_dec_256
+mbin_dispatch_init aes_cbc_dec_256, aes_cbc_dec_256_sse, aes_cbc_dec_256_avx, aes_cbc_dec_256_avx
+
+mbin_interface aes_cbc_enc_128
+mbin_dispatch_init aes_cbc_enc_128, aes_cbc_enc_128_x4, aes_cbc_enc_128_x8, aes_cbc_enc_128_x8
+mbin_interface aes_cbc_enc_192
+mbin_dispatch_init aes_cbc_enc_192, aes_cbc_enc_192_x4, aes_cbc_enc_192_x8, aes_cbc_enc_192_x8
+mbin_interface aes_cbc_enc_256
+mbin_dispatch_init aes_cbc_enc_256, aes_cbc_enc_256_x4, aes_cbc_enc_256_x8, aes_cbc_enc_256_x8
+
+
+
+;;; func core, ver, snum
+slversion aes_cbc_enc_128, 00, 00, 0291
+slversion aes_cbc_dec_128, 00, 00, 0292
+slversion aes_cbc_enc_192, 00, 00, 0293
+slversion aes_cbc_dec_192, 00, 00, 0294
+slversion aes_cbc_enc_256, 00, 00, 0295
+slversion aes_cbc_dec_256, 00, 00, 0296
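The mbin_interface and mbin_dispatch_init macros come from multibinary.asm, which is not part of this hunk; each interface symbol initially resolves through a stub that probes CPU features once and then sticks with the chosen routine. The three candidates listed after each interface name appear to be the baseline/SSE, AVX and AVX2 slots, with the AVX routine listed twice here. A rough C analogue of that first-call dispatch, purely illustrative (cpu_has_avx() is a hypothetical probe, not an ISA-L function):

    #include <stdint.h>

    typedef void (*cbc_enc_fn)(void *in, uint8_t *IV, uint8_t *keys,
                               void *out, uint64_t len_bytes);

    /* Real implementations live in cbc_enc_128_x4_sb.asm (SSE) and
     * cbc_enc_128_x8_sb.asm (AVX); prototypes follow their header comments. */
    extern void aes_cbc_enc_128_x4(void *, uint8_t *, uint8_t *, void *, uint64_t);
    extern void aes_cbc_enc_128_x8(void *, uint8_t *, uint8_t *, void *, uint64_t);
    extern int cpu_has_avx(void);                     /* hypothetical probe */

    static void aes_cbc_enc_128_resolve(void *in, uint8_t *IV, uint8_t *keys,
                                        void *out, uint64_t len_bytes);

    /* Interface slot: starts at the resolver, patched on first call. */
    static cbc_enc_fn aes_cbc_enc_128_ptr = aes_cbc_enc_128_resolve;

    static void aes_cbc_enc_128_resolve(void *in, uint8_t *IV, uint8_t *keys,
                                        void *out, uint64_t len_bytes)
    {
            aes_cbc_enc_128_ptr = cpu_has_avx() ? aes_cbc_enc_128_x8
                                                : aes_cbc_enc_128_x4;
            aes_cbc_enc_128_ptr(in, IV, keys, out, len_bytes);
    }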
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c
new file mode 100644
index 000000000..017e523d8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_ossl_perf.c
@@ -0,0 +1,315 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_cbc.h>
+#include <test.h>
+#include "ossl_helper.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static unsigned char const ic[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
+ 0x0e, 0x0f
+};
+
+static unsigned char *plaintext, *cbc_plaintext, *cyphertext, *ossl_plaintext,
+ *ossl_cyphertext;
+static uint8_t test_key[CBC_256_BITS];
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ unsigned int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+int aes_128_perf(uint8_t * key)
+{
+ int i;
+
+ /* Initialize our cipher context; the same input vectors can be reused */
+ uint8_t *iv = NULL;
+ struct cbc_key_data *key_data = NULL;
+
+ posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN));
+ posix_memalign((void **)&key_data, 16, (sizeof(*key_data)));
+ if ((NULL == iv) || (NULL == key_data))
+ return 1;
+
+ memcpy(iv, ic, CBC_IV_DATA_LEN);
+
+ aes_cbc_precomp(key, 128, key_data);
+ aes_cbc_enc_128(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN);
+ openssl_aes_128_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_enc_128(plaintext, iv, key_data->enc_keys,
+ plaintext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_128_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_128_cbc_enc(key, iv, TEST_LEN, plaintext, plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_128_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_dec_128(cyphertext, iv, key_data->dec_keys,
+ cbc_plaintext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_128_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_128_cbc_dec(key, iv, TEST_LEN,
+ ossl_cyphertext, ossl_plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_128_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ printf("\n");
+ return 0;
+}
+
+int aes_192_perf(uint8_t * key)
+{
+ int i;
+ uint8_t *iv = NULL;
+ struct cbc_key_data *key_data = NULL;
+
+ posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN));
+ posix_memalign((void **)&key_data, 16, (sizeof(*key_data)));
+ if ((NULL == iv) || (NULL == key_data))
+ return 1;
+
+ memcpy(iv, ic, CBC_IV_DATA_LEN);
+ aes_cbc_precomp(key, 192, key_data);
+ aes_cbc_enc_192(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN);
+ openssl_aes_192_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_enc_192(plaintext, iv, key_data->enc_keys,
+ cyphertext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_192_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_192_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_192_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_dec_192(cyphertext, iv, key_data->dec_keys,
+ cbc_plaintext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_192_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_192_cbc_dec(key, iv, TEST_LEN,
+ ossl_cyphertext, ossl_plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_192_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ printf("\n");
+ return 0;
+}
+
+int aes_256_perf(uint8_t * key)
+{
+ int i;
+ uint8_t *iv = NULL;
+ struct cbc_key_data *key_data = NULL;
+
+ posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN));
+ posix_memalign((void **)&key_data, 16, (sizeof(*key_data)));
+ if ((NULL == iv) || (NULL == key_data))
+ return 1;
+
+ aes_cbc_precomp(key, 256, key_data);
+ memcpy(iv, ic, CBC_IV_DATA_LEN);
+ aes_cbc_enc_256(plaintext, iv, key_data->enc_keys, cyphertext, TEST_LEN);
+ openssl_aes_256_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_enc_256(plaintext, iv, key_data->enc_keys,
+ cyphertext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_256 encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_256_cbc_enc(key, iv, TEST_LEN, plaintext, ossl_cyphertext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_256_encode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aes_cbc_dec_256(cyphertext, iv, key_data->dec_keys,
+ cbc_plaintext, TEST_LEN);
+ }
+
+ perf_stop(&stop);
+ printf("ISA-L__aes_cbc_256 decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_256_cbc_dec(key, iv, TEST_LEN,
+ ossl_cyphertext, ossl_plaintext);
+ }
+
+ perf_stop(&stop);
+ printf("OpenSSL_aes_cbc_256_decode" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ printf("\n");
+ return 0;
+}
+
+int main(void)
+{
+ uint32_t OK = 0;
+
+ srand(TEST_SEED);
+
+ plaintext = malloc(TEST_LEN);
+ cbc_plaintext = malloc(TEST_LEN);
+ cyphertext = malloc(TEST_LEN);
+ ossl_plaintext = malloc(TEST_LEN);
+ ossl_cyphertext = malloc(TEST_LEN);
+ if (NULL == plaintext || NULL == cyphertext || NULL == cbc_plaintext
+ || NULL == ossl_plaintext || NULL == ossl_cyphertext) {
+ printf("malloc of testsize:0x%x failed\n", TEST_LEN);
+ return 1;
+ }
+
+ mk_rand_data(plaintext, TEST_LEN);
+ mk_rand_data(test_key, sizeof(test_key));
+ printf("AES CBC ISA-L vs OpenSSL performance:\n");
+ OK += aes_128_perf(test_key);
+ OK += aes_192_perf(test_key);
+ OK += aes_256_perf(test_key);
+
+ return OK;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c
new file mode 100644
index 000000000..6284d905b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_pre.c
@@ -0,0 +1,56 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <aes_cbc.h>
+#include <aes_keyexp.h>
+
+int aes_cbc_precomp(uint8_t * key, int key_size, struct cbc_key_data *keys_blk)
+{
+ if (CBC_128_BITS == key_size) {
+ aes_keyexp_128(key, keys_blk->enc_keys, keys_blk->dec_keys);
+ } else if (CBC_192_BITS == key_size) {
+ aes_keyexp_192(key, keys_blk->enc_keys, keys_blk->dec_keys);
+ } else if (CBC_256_BITS == key_size) {
+ aes_keyexp_256(key, keys_blk->enc_keys, keys_blk->dec_keys);
+ } else {
+ //Invalid key length
+ return 1;
+ }
+ return 0;
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver aes_cbc_precomp_slver_00000297;
+struct slver aes_cbc_precomp_slver = { 0x0297, 0x00, 0x00 };
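aes_cbc_precomp is the one-time key expansion step: it fills both the encrypt and decrypt schedules of a cbc_key_data block and returns non-zero for an unsupported key size. A short round-trip sketch against the dispatched entry points, assuming <aes_cbc.h> from this series; buffer names are illustrative:

    #include <stdint.h>
    #include <stdlib.h>
    #include <aes_cbc.h>

    int cbc256_roundtrip_sketch(uint8_t *raw_key, uint8_t *iv,
                                uint8_t *pt, uint8_t *ct, uint64_t len)
    {
            struct cbc_key_data *keys = NULL;

            /* The tests in this patch keep the key schedule 16-byte aligned. */
            if (posix_memalign((void **)&keys, 16, sizeof(*keys)))
                    return 1;
            if (aes_cbc_precomp(raw_key, CBC_256_BITS, keys)) {
                    free(keys);
                    return 1;                         /* unsupported key size */
            }
            aes_cbc_enc_256(pt, iv, keys->enc_keys, ct, len);
            aes_cbc_dec_256(ct, iv, keys->dec_keys, pt, len);   /* pt restored */
            free(keys);
            return 0;
    }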
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h
new file mode 100644
index 000000000..981aae965
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors.h
@@ -0,0 +1,466 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef AES_CBC_STD_VECTORS_H_
+#define AES_CBC_STD_VECTORS_H_
+#include <aes_cbc.h>
+
+
+// struct to hold pointers to the cbc data vectors
+struct cbc_vector {
+ uint8_t* K; // AES Key
+ cbc_key_size K_LEN; // length of key in bits
+ uint8_t* IV; // initialization vector used by CBC
+ uint64_t P_LEN; // length of our plaintext
+ uint8_t* P; // Plain text
+ //outputs of encryption
+ uint8_t* EXP_C; // same length as P
+ // used in vector checks, not populated in std vector array
+ uint8_t *C;
+ struct cbc_key_data *KEYS;
+};
+
+
+///////////////////////////////////////////
+// Test vectors from:
+// Intel IPSec library 1.3
+//
+///////////////////////////////////////////
+static unsigned char K1[] = {
+ 0x2b, 0x7e, 0x15, 0x16, 0x28, 0xae, 0xd2, 0xa6, 0xab, 0xf7, 0x15, 0x88, 0x09, 0xcf, 0x4f, 0x3c
+};
+static unsigned char IV1[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+};
+static unsigned char P1[] = {
+ 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a,
+ 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51,
+ 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef,
+ 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10
+};
+static unsigned char C1[] = {
+ 0x76, 0x49, 0xab, 0xac, 0x81, 0x19, 0xb2, 0x46, 0xce, 0xe9, 0x8e, 0x9b, 0x12, 0xe9, 0x19, 0x7d,
+ 0x50, 0x86, 0xcb, 0x9b, 0x50, 0x72, 0x19, 0xee, 0x95, 0xdb, 0x11, 0x3a, 0x91, 0x76, 0x78, 0xb2,
+ 0x73, 0xbe, 0xd6, 0xb8, 0xe3, 0xc1, 0x74, 0x3b, 0x71, 0x16, 0xe6, 0x9e, 0x22, 0x22, 0x95, 0x16,
+ 0x3f, 0xf1, 0xca, 0xa1, 0x68, 0x1f, 0xac, 0x09, 0x12, 0x0e, 0xca, 0x30, 0x75, 0x86, 0xe1, 0xa7
+};
+
+static unsigned char K2[] = {
+ 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81,
+ 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7, 0x2d, 0x98, 0x10, 0xa3, 0x09, 0x14, 0xdf, 0xf4
+};
+static unsigned char IV2[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+};
+static unsigned char P2[] = {
+ 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a,
+ 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51,
+ 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef,
+ 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10
+};
+static unsigned char C2[] = {
+ 0xf5, 0x8c, 0x4c, 0x04, 0xd6, 0xe5, 0xf1, 0xba, 0x77, 0x9e, 0xab, 0xfb, 0x5f, 0x7b, 0xfb, 0xd6,
+ 0x9c, 0xfc, 0x4e, 0x96, 0x7e, 0xdb, 0x80, 0x8d, 0x67, 0x9f, 0x77, 0x7b, 0xc6, 0x70, 0x2c, 0x7d,
+ 0x39, 0xf2, 0x33, 0x69, 0xa9, 0xd9, 0xba, 0xcf, 0xa5, 0x30, 0xe2, 0x63, 0x04, 0x23, 0x14, 0x61,
+ 0xb2, 0xeb, 0x05, 0xe2, 0xc3, 0x9b, 0xe9, 0xfc, 0xda, 0x6c, 0x19, 0x07, 0x8c, 0x6a, 0x9d, 0x1b
+};
+
+static unsigned char K3[] = {
+ 0x60, 0x3d, 0xeb, 0x10, 0x15, 0xca, 0x71, 0xbe, 0x2b, 0x73, 0xae, 0xf0, 0x85, 0x7d, 0x77, 0x81,
+ 0x1f, 0x35, 0x2c, 0x07, 0x3b, 0x61, 0x08, 0xd7
+};
+static unsigned char IV3[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+};
+static unsigned char P3[] = {
+ 0x6b, 0xc1, 0xbe, 0xe2, 0x2e, 0x40, 0x9f, 0x96, 0xe9, 0x3d, 0x7e, 0x11, 0x73, 0x93, 0x17, 0x2a,
+ 0xae, 0x2d, 0x8a, 0x57, 0x1e, 0x03, 0xac, 0x9c, 0x9e, 0xb7, 0x6f, 0xac, 0x45, 0xaf, 0x8e, 0x51,
+ 0x30, 0xc8, 0x1c, 0x46, 0xa3, 0x5c, 0xe4, 0x11, 0xe5, 0xfb, 0xc1, 0x19, 0x1a, 0x0a, 0x52, 0xef,
+ 0xf6, 0x9f, 0x24, 0x45, 0xdf, 0x4f, 0x9b, 0x17, 0xad, 0x2b, 0x41, 0x7b, 0xe6, 0x6c, 0x37, 0x10
+};
+static unsigned char C3[] = {
+ 0x17, 0x70, 0x1a, 0x9d, 0x29, 0xc9, 0x1a, 0x94, 0xce, 0xed, 0x72, 0x3c, 0x34, 0xe8,
+ 0x7a, 0xbe, 0x1c, 0x96, 0x84, 0x5c, 0xa8, 0xb7, 0xe8, 0x58, 0x6d, 0xfe, 0xf2, 0xfa,
+ 0x6b, 0xed, 0x24, 0x09, 0x8a, 0x52, 0xce, 0xe8, 0xd7, 0x6d, 0xb6, 0x7b, 0xfd, 0xe2,
+ 0x15, 0x53, 0xd3, 0x1c, 0x28, 0x33, 0xf7, 0x7e, 0xb5, 0x95, 0x00, 0xac, 0x49, 0x03,
+ 0xbc, 0x70, 0x76, 0xb1, 0x84, 0x65, 0xd0, 0xea
+};
+
+///////////////////////////////////////////
+// Test vectors from:
+// 'https://tools.ietf.org/html/rfc3602#section-3.2'
+// The AES-CBC Cipher Algorithm and Its Use with IPsec
+//
+///////////////////////////////////////////
+/*
+Case #1: Encrypting 16 bytes (1 block) using AES-CBC with 128-bit key
+Key : 0x06a9214036b8a15b512e03d534120006
+IV : 0x3dafba429d9eb430b422da802c9fac41
+Plaintext : "Single block msg"
+Ciphertext: 0xe353779c1079aeb82708942dbe77181a
+ *
+ */
+static unsigned char K4[] = {
+ 0x06, 0xa9, 0x21, 0x40, 0x36, 0xb8, 0xa1, 0x5b, 0x51, 0x2e, 0x03, 0xd5, 0x34, 0x12, 0x00, 0x06
+};
+static unsigned char IV4[] = {
+ 0x3d, 0xaf, 0xba, 0x42, 0x9d, 0x9e, 0xb4, 0x30, 0xb4, 0x22, 0xda, 0x80, 0x2c, 0x9f, 0xac, 0x41
+};
+static unsigned char P4[] = {
+ "Single block msg"
+};
+static unsigned char C4[] = {
+ 0xe3, 0x53, 0x77, 0x9c, 0x10, 0x79, 0xae, 0xb8, 0x27, 0x08, 0x94, 0x2d, 0xbe, 0x77, 0x18, 0x1a
+};
+
+/*
+Case #2: Encrypting 32 bytes (2 blocks) using AES-CBC with 128-bit key
+Key : 0xc286696d887c9aa0611bbb3e2025a45a
+IV : 0x562e17996d093d28ddb3ba695a2e6f58
+Plaintext : 0x000102030405060708090a0b0c0d0e0f
+ 101112131415161718191a1b1c1d1e1f
+Ciphertext: 0xd296cd94c2cccf8a3a863028b5e1dc0a
+ 7586602d253cfff91b8266bea6d61ab1
+*/
+static unsigned char K5[] = {
+ 0xc2, 0x86, 0x69, 0x6d, 0x88, 0x7c, 0x9a, 0xa0, 0x61, 0x1b, 0xbb, 0x3e, 0x20, 0x25, 0xa4, 0x5a
+};
+static unsigned char IV5[] = {
+ 0x56, 0x2e, 0x17, 0x99, 0x6d, 0x09, 0x3d, 0x28, 0xdd, 0xb3, 0xba, 0x69, 0x5a, 0x2e, 0x6f, 0x58
+};
+static unsigned char P5[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
+ 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b,
+ 0x1c, 0x1d, 0x1e, 0x1f
+};
+static unsigned char C5[] = {
+ 0xd2, 0x96, 0xcd, 0x94, 0xc2, 0xcc, 0xcf, 0x8a, 0x3a, 0x86, 0x30, 0x28, 0xb5, 0xe1,
+ 0xdc, 0x0a, 0x75, 0x86, 0x60, 0x2d, 0x25, 0x3c, 0xff, 0xf9, 0x1b, 0x82, 0x66, 0xbe,
+ 0xa6, 0xd6, 0x1a, 0xb1
+};
+
+/*
+Case #3: Encrypting 48 bytes (3 blocks) using AES-CBC with 128-bit key
+Key : 0x6c3ea0477630ce21a2ce334aa746c2cd
+IV : 0xc782dc4c098c66cbd9cd27d825682c81
+Plaintext : "This is a 48-byte message (exactly 3 AES blocks)"
+Ciphertext: 0xd0a02b3836451753d493665d33f0e886
+ 2dea54cdb293abc7506939276772f8d5
+ 021c19216bad525c8579695d83ba2684
+
+ */
+static unsigned char K6[] = {
+ 0x6c, 0x3e, 0xa0, 0x47, 0x76, 0x30, 0xce, 0x21, 0xa2, 0xce, 0x33, 0x4a, 0xa7, 0x46, 0xc2, 0xcd
+};
+static unsigned char IV6[] = {
+ 0xc7, 0x82, 0xdc, 0x4c, 0x09, 0x8c, 0x66, 0xcb, 0xd9, 0xcd, 0x27, 0xd8, 0x25, 0x68, 0x2c, 0x81
+};
+static unsigned char P6[] = {
+ "This is a 48-byte message (exactly 3 AES blocks)"
+};
+static unsigned char C6[] = {
+ 0xd0, 0xa0, 0x2b, 0x38, 0x36, 0x45, 0x17, 0x53, 0xd4, 0x93, 0x66, 0x5d, 0x33, 0xf0, 0xe8, 0x86,
+ 0x2d, 0xea, 0x54, 0xcd, 0xb2, 0x93, 0xab, 0xc7, 0x50, 0x69, 0x39, 0x27, 0x67, 0x72, 0xf8, 0xd5,
+ 0x02, 0x1c, 0x19, 0x21, 0x6b, 0xad, 0x52, 0x5c, 0x85, 0x79, 0x69, 0x5d, 0x83, 0xba, 0x26, 0x84
+};
+
+/*
+Case #4: Encrypting 64 bytes (4 blocks) using AES-CBC with 128-bit key
+Key : 0x56e47a38c5598974bc46903dba290349
+IV : 0x8ce82eefbea0da3c44699ed7db51b7d9
+Plaintext : 0xa0a1a2a3a4a5a6a7a8a9aaabacadaeaf
+ b0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ c0c1c2c3c4c5c6c7c8c9cacbcccdcecf
+ d0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+Ciphertext: 0xc30e32ffedc0774e6aff6af0869f71aa
+ 0f3af07a9a31a9c684db207eb0ef8e4e
+ 35907aa632c3ffdf868bb7b29d3d46ad
+ 83ce9f9a102ee99d49a53e87f4c3da55
+ */
+static unsigned char K7[] = {
+ 0x56, 0xe4, 0x7a, 0x38, 0xc5, 0x59, 0x89, 0x74, 0xbc, 0x46, 0x90, 0x3d, 0xba, 0x29, 0x03, 0x49
+};
+static unsigned char IV7[] = {
+ 0x8c, 0xe8, 0x2e, 0xef, 0xbe, 0xa0, 0xda, 0x3c, 0x44, 0x69, 0x9e, 0xd7, 0xdb, 0x51, 0xb7, 0xd9
+};
+static unsigned char P7[] = {
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf
+};
+static unsigned char C7[] = {
+ 0xc3, 0x0e, 0x32, 0xff, 0xed, 0xc0, 0x77, 0x4e, 0x6a, 0xff, 0x6a, 0xf0, 0x86, 0x9f, 0x71, 0xaa,
+ 0x0f, 0x3a, 0xf0, 0x7a, 0x9a, 0x31, 0xa9, 0xc6, 0x84, 0xdb, 0x20, 0x7e, 0xb0, 0xef, 0x8e, 0x4e,
+ 0x35, 0x90, 0x7a, 0xa6, 0x32, 0xc3, 0xff, 0xdf, 0x86, 0x8b, 0xb7, 0xb2, 0x9d, 0x3d, 0x46, 0xad,
+ 0x83, 0xce, 0x9f, 0x9a, 0x10, 0x2e, 0xe9, 0x9d, 0x49, 0xa5, 0x3e, 0x87, 0xf4, 0xc3, 0xda, 0x55
+};
+
+/*
+Case #5: Sample transport-mode ESP packet (ping 192.168.123.100)
+Key: 90d382b4 10eeba7a d938c46c ec1a82bf
+SPI: 4321
+Source address: 192.168.123.3
+Destination address: 192.168.123.100
+Sequence number: 1
+IV: e96e8c08 ab465763 fd098d45 dd3ff893
+
+Original packet:
+IP header (20 bytes): 45000054 08f20000 4001f9fe c0a87b03 c0a87b64
+Data (64 bytes):
+08000ebd a70a0000 8e9c083d b95b0700 08090a0b 0c0d0e0f 10111213 14151617
+18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637
+
+Augment data with:
+Padding: 01020304 05060708 090a0b0c 0d0e
+Pad length: 0e
+Next header: 01 (ICMP)
+
+Pre-encryption Data with padding, pad length and next header (80 bytes):
+08000ebd a70a0000 8e9c083d b95b0700 08090a0b 0c0d0e0f 10111213 14151617
+18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637
+01020304 05060708 090a0b0c 0d0e0e01
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500007c 08f20000 4032f9a5 c0a87b03 c0a87b64
+SPI/Seq #: 00004321 00000001
+IV: e96e8c08 ab465763 fd098d45 dd3ff893
+Encrypted Data (80 bytes):
+f663c25d 325c18c6 a9453e19 4e120849 a4870b66 cc6b9965 330013b4 898dc856
+a4699e52 3a55db08 0b59ec3a 8e4b7e52 775b07d1 db34ed9c 538ab50c 551b874a
+a269add0 47ad2d59 13ac19b7 cfbad4a6
+*/
+static unsigned char K8[] = {
+ 0x90, 0xd3, 0x82, 0xb4, 0x10, 0xee, 0xba, 0x7a, 0xd9, 0x38, 0xc4, 0x6c, 0xec, 0x1a, 0x82, 0xbf
+};
+static unsigned char IV8[] = {
+ 0xe9, 0x6e, 0x8c, 0x08, 0xab, 0x46, 0x57, 0x63, 0xfd, 0x09, 0x8d, 0x45, 0xdd, 0x3f, 0xf8, 0x93
+};
+static unsigned char P8[] = {
+ 0x08, 0x00, 0x0e, 0xbd, 0xa7, 0x0a, 0x00, 0x00, 0x8e, 0x9c, 0x08, 0x3d, 0xb9, 0x5b, 0x07, 0x00,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0e, 0x01
+};
+static unsigned char C8[] = {
+ 0xf6, 0x63, 0xc2, 0x5d, 0x32, 0x5c, 0x18, 0xc6, 0xa9, 0x45, 0x3e, 0x19, 0x4e, 0x12, 0x08, 0x49,
+ 0xa4, 0x87, 0x0b, 0x66, 0xcc, 0x6b, 0x99, 0x65, 0x33, 0x00, 0x13, 0xb4, 0x89, 0x8d, 0xc8, 0x56,
+ 0xa4, 0x69, 0x9e, 0x52, 0x3a, 0x55, 0xdb, 0x08, 0x0b, 0x59, 0xec, 0x3a, 0x8e, 0x4b, 0x7e, 0x52,
+ 0x77, 0x5b, 0x07, 0xd1, 0xdb, 0x34, 0xed, 0x9c, 0x53, 0x8a, 0xb5, 0x0c, 0x55, 0x1b, 0x87, 0x4a,
+ 0xa2, 0x69, 0xad, 0xd0, 0x47, 0xad, 0x2d, 0x59, 0x13, 0xac, 0x19, 0xb7, 0xcf, 0xba, 0xd4, 0xa6
+};
+
+/*
+Case #6: Sample transport-mode ESP packet
+ (ping -p 77 -s 20 192.168.123.100)
+Key: 90d382b4 10eeba7a d938c46c ec1a82bf
+SPI: 4321
+Source address: 192.168.123.3
+Destination address: 192.168.123.100
+Sequence number: 8
+IV: 69d08df7 d203329d b093fc49 24e5bd80
+
+Original packet:
+IP header (20 bytes): 45000030 08fe0000 4001fa16 c0a87b03 c0a87b64
+Data (28 bytes):
+0800b5e8 a80a0500 a69c083d 0b660e00 77777777 77777777 77777777
+
+Augment data with:
+Padding: 0102
+Pad length: 02
+Next header: 01 (ICMP)
+
+Pre-encryption Data with padding, pad length and next header (32 bytes):
+0800b5e8 a80a0500 a69c083d 0b660e00 77777777 77777777 77777777 01020201
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500004c 08fe0000 4032f9c9 c0a87b03 c0a87b64
+SPI/Seq #: 00004321 00000008
+IV: 69d08df7 d203329d b093fc49 24e5bd80
+Encrypted Data (32 bytes):
+f5199588 1ec4e0c4 488987ce 742e8109 689bb379 d2d750c0 d915dca3 46a89f75
+ */
+static unsigned char K9[] = {
+ 0x90, 0xd3, 0x82, 0xb4, 0x10, 0xee, 0xba, 0x7a, 0xd9, 0x38, 0xc4, 0x6c, 0xec, 0x1a, 0x82, 0xbf
+};
+static unsigned char IV9[] = {
+ 0x69, 0xd0, 0x8d, 0xf7, 0xd2, 0x03, 0x32, 0x9d, 0xb0, 0x93, 0xfc, 0x49, 0x24, 0xe5, 0xbd, 0x80
+};
+static unsigned char P9[] = {
+ 0x08, 0x00, 0xb5, 0xe8, 0xa8, 0x0a, 0x05, 0x00, 0xa6, 0x9c, 0x08, 0x3d, 0x0b, 0x66, 0x0e, 0x00,
+ 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x77, 0x01, 0x02, 0x02, 0x01
+};
+static unsigned char C9[] = {
+ 0xf5, 0x19, 0x95, 0x88, 0x1e, 0xc4, 0xe0, 0xc4, 0x48, 0x89, 0x87, 0xce, 0x74, 0x2e, 0x81, 0x09,
+ 0x68, 0x9b, 0xb3, 0x79, 0xd2, 0xd7, 0x50, 0xc0, 0xd9, 0x15, 0xdc, 0xa3, 0x46, 0xa8, 0x9f, 0x75
+};
+
+/*
+Case #7: Sample tunnel-mode ESP packet (ping 192.168.123.200)
+Key: 01234567 89abcdef 01234567 89abcdef
+SPI: 8765
+Source address: 192.168.123.3
+Destination address: 192.168.123.200
+Sequence number: 2
+IV: f4e76524 4f6407ad f13dc138 0f673f37
+
+Original packet:
+IP header (20 bytes): 45000054 09040000 4001f988 c0a87b03 c0a87bc8
+Data (64 bytes):
+08009f76 a90a0100 b49c083d 02a20400 08090a0b 0c0d0e0f 10111213 14151617
+18191a1b 1c1d1e1f 20212223 24252627 28292a2b 2c2d2e2f 30313233 34353637
+
+Augment data with:
+Padding: 01020304 05060708 090a
+Pad length: 0a
+Next header: 04 (IP-in-IP)
+
+Pre-encryption Data with original IP header, padding, pad length and
+ next header (96 bytes):
+45000054 09040000 4001f988 c0a87b03 c0a87bc8 08009f76 a90a0100 b49c083d
+02a20400 08090a0b 0c0d0e0f 10111213 14151617 18191a1b 1c1d1e1f 20212223
+24252627 28292a2b 2c2d2e2f 30313233 34353637 01020304 05060708 090a0a04
+
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500008c 09050000 4032f91e c0a87b03 c0a87bc8
+SPI/Seq #: 00008765 00000002
+IV: f4e76524 4f6407ad f13dc138 0f673f37
+Encrypted Data (96 bytes):
+773b5241 a4c44922 5e4f3ce5 ed611b0c 237ca96c f74a9301 3c1b0ea1 a0cf70f8
+e4ecaec7 8ac53aad 7a0f022b 859243c6 47752e94 a859352b 8a4d4d2d ecd136e5
+c177f132 ad3fbfb2 201ac990 4c74ee0a 109e0ca1 e4dfe9d5 a100b842 f1c22f0d
+ */
+static unsigned char K10[] = {
+ 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef
+};
+static unsigned char IV10[] = {
+ 0xf4, 0xe7, 0x65, 0x24, 0x4f, 0x64, 0x07, 0xad, 0xf1, 0x3d, 0xc1, 0x38, 0x0f, 0x67, 0x3f, 0x37
+};
+static unsigned char P10[] = {
+ 0x45, 0x00, 0x00, 0x54, 0x09, 0x04, 0x00, 0x00, 0x40, 0x01, 0xf9, 0x88, 0xc0, 0xa8, 0x7b, 0x03,
+ 0xc0, 0xa8, 0x7b, 0xc8, 0x08, 0x00, 0x9f, 0x76, 0xa9, 0x0a, 0x01, 0x00, 0xb4, 0x9c, 0x08, 0x3d,
+ 0x02, 0xa2, 0x04, 0x00, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13,
+ 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23,
+ 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33,
+ 0x34, 0x35, 0x36, 0x37, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x04
+
+};
+static unsigned char C10[] = {
+ 0x77, 0x3b, 0x52, 0x41, 0xa4, 0xc4, 0x49, 0x22, 0x5e, 0x4f, 0x3c, 0xe5, 0xed, 0x61, 0x1b, 0x0c,
+ 0x23, 0x7c, 0xa9, 0x6c, 0xf7, 0x4a, 0x93, 0x01, 0x3c, 0x1b, 0x0e, 0xa1, 0xa0, 0xcf, 0x70, 0xf8,
+ 0xe4, 0xec, 0xae, 0xc7, 0x8a, 0xc5, 0x3a, 0xad, 0x7a, 0x0f, 0x02, 0x2b, 0x85, 0x92, 0x43, 0xc6,
+ 0x47, 0x75, 0x2e, 0x94, 0xa8, 0x59, 0x35, 0x2b, 0x8a, 0x4d, 0x4d, 0x2d, 0xec, 0xd1, 0x36, 0xe5,
+ 0xc1, 0x77, 0xf1, 0x32, 0xad, 0x3f, 0xbf, 0xb2, 0x20, 0x1a, 0xc9, 0x90, 0x4c, 0x74, 0xee, 0x0a,
+ 0x10, 0x9e, 0x0c, 0xa1, 0xe4, 0xdf, 0xe9, 0xd5, 0xa1, 0x00, 0xb8, 0x42, 0xf1, 0xc2, 0x2f, 0x0d
+};
+
+/*
+Case #8: Sample tunnel-mode ESP packet
+ (ping -p ff -s 40 192.168.123.200)
+Key: 01234567 89abcdef 01234567 89abcdef
+SPI: 8765
+Source address: 192.168.123.3
+Destination address: 192.168.123.200
+Sequence number: 5
+IV: 85d47224 b5f3dd5d 2101d4ea 8dffab22
+
+Original packet:
+IP header (20 bytes): 45000044 090c0000 4001f990 c0a87b03 c0a87bc8
+Data (48 bytes):
+0800d63c aa0a0200 c69c083d a3de0300 ffffffff ffffffff ffffffff ffffffff
+ffffffff ffffffff ffffffff ffffffff
+
+Augment data with:
+Padding: 01020304 05060708 090a
+Pad length: 0a
+Next header: 04 (IP-in-IP)
+
+Pre-encryption Data with original IP header, padding, pad length and
+ next header (80 bytes):
+45000044 090c0000 4001f990 c0a87b03 c0a87bc8 0800d63c aa0a0200 c69c083d
+a3de0300 ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff ffffffff
+ffffffff 01020304 05060708 090a0a04
+
+Post-encryption packet with SPI, Sequence number, IV:
+IP header: 4500007c 090d0000 4032f926 c0a87b03 c0a87bc8
+SPI/Seq #: 00008765 00000005
+IV: 85d47224 b5f3dd5d 2101d4ea 8dffab22
+Encrypted Data (80 bytes):
+15b92683 819596a8 047232cc 00f7048f e45318e1 1f8a0f62 ede3c3fc 61203bb5
+0f980a08 c9843fd3 a1b06d5c 07ff9639 b7eb7dfb 3512e5de 435e7207 ed971ef3
+d2726d9b 5ef6affc 6d17a0de cbb13892
+ */
+static unsigned char K11[] = {
+ 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef, 0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef
+};
+static unsigned char IV11[] = {
+ 0x85, 0xd4, 0x72, 0x24, 0xb5, 0xf3, 0xdd, 0x5d, 0x21, 0x01, 0xd4, 0xea, 0x8d, 0xff, 0xab, 0x22
+};
+static unsigned char P11[] = {
+ 0x45, 0x00, 0x00, 0x44, 0x09, 0x0c, 0x00, 0x00, 0x40, 0x01, 0xf9, 0x90, 0xc0, 0xa8, 0x7b, 0x03,
+ 0xc0, 0xa8, 0x7b, 0xc8, 0x08, 0x00, 0xd6, 0x3c, 0xaa, 0x0a, 0x02, 0x00, 0xc6, 0x9c, 0x08, 0x3d,
+ 0xa3, 0xde, 0x03, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0a, 0x04
+};
+static unsigned char C11[] = {
+ 0x15, 0xb9, 0x26, 0x83, 0x81, 0x95, 0x96, 0xa8, 0x04, 0x72, 0x32, 0xcc, 0x00, 0xf7, 0x04, 0x8f,
+ 0xe4, 0x53, 0x18, 0xe1, 0x1f, 0x8a, 0x0f, 0x62, 0xed, 0xe3, 0xc3, 0xfc, 0x61, 0x20, 0x3b, 0xb5,
+ 0x0f, 0x98, 0x0a, 0x08, 0xc9, 0x84, 0x3f, 0xd3, 0xa1, 0xb0, 0x6d, 0x5c, 0x07, 0xff, 0x96, 0x39,
+ 0xb7, 0xeb, 0x7d, 0xfb, 0x35, 0x12, 0xe5, 0xde, 0x43, 0x5e, 0x72, 0x07, 0xed, 0x97, 0x1e, 0xf3,
+ 0xd2, 0x72, 0x6d, 0x9b, 0x5e, 0xf6, 0xaf, 0xfc, 0x6d, 0x17, 0xa0, 0xde, 0xcb, 0xb1, 0x38, 0x92
+};
+
+
+#define min_size(a, b) (((a)<(b))?(a):(b))
+// Plain and cypher text will be the same size
+// Those vectors using strings for plain text have an extra null terminator that needs
+// to be ignored
+#define vect_size(P, C) (min_size((sizeof(P)),(sizeof(C))))
+#define CBC_KEY_LEN(kdata) (sizeof(kdata))
+
+//field order {K, Klen, IV, Plen, P, C};
+#define vector(N) {K##N, (CBC_KEY_LEN(K##N)), IV##N, vect_size(P##N,C##N), P##N, C##N, NULL, NULL, /*NULL, NULL*/}
+struct cbc_vector const cbc_vectors[] = {
+ vector(1),
+ vector(2),
+ vector(3),
+ vector(4),
+ vector(5),
+ vector(6),
+ vector(7),
+ vector(8),
+ vector(9),
+ vector(10),
+ vector(11),
+};
+
+#endif /* AES_CBC_STD_VECTORS_H_ */
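One detail worth calling out: vect_size() takes the smaller of sizeof(P) and sizeof(C) because the plaintexts given as string literals (P4, P6) carry a terminating NUL that is not part of the vector. For example, "Single block msg" is 16 characters, so sizeof(P4) is 17 while sizeof(C4) is 16, and vect_size(P4, C4) yields the true 16-byte length. A tiny self-check, assuming this header is included (vect_size_demo is an illustrative name):

    #include <assert.h>

    static void vect_size_demo(void)
    {
            /* The string literal adds one NUL byte beyond the 16 characters. */
            assert(sizeof(P4) == 17);
            assert(sizeof(C4) == 16);
            assert(vect_size(P4, C4) == 16);
    }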
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c
new file mode 100644
index 000000000..4af562072
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_random_test.c
@@ -0,0 +1,443 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <aes_cbc.h>
+#include "types.h"
+#include "ossl_helper.h"
+#include "cbc_std_vectors.h"
+
+//#define CBC_VECTORS_VERBOSE
+//#define CBC_VECTORS_EXTRA_VERBOSE
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#ifndef RANDOMS
+# define RANDOMS 100
+#endif
+#ifndef TEST_LEN
+# define TEST_LEN (8*1024*1024)
+#endif
+#ifndef PAGE_LEN
+# define PAGE_LEN (4*1024)
+#endif
+#ifndef MAX_UNALIGNED
+# define MAX_UNALIGNED (16)
+#endif
+
+static cbc_key_size const Ksize[] = { CBC_128_BITS, CBC_192_BITS, CBC_256_BITS };
+
+typedef void (*aes_cbc_generic) (uint8_t * in,
+ uint8_t * IV,
+ uint8_t * keys, uint8_t * out, uint64_t len_bytes);
+
+int OpenSslEnc(uint8_t k_len,
+ uint8_t * key, uint8_t * in, uint8_t * iv, uint8_t * out, uint64_t len_bytes)
+{
+ if (CBC_128_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL128 ");
+#endif
+ openssl_aes_128_cbc_enc(key, (uint8_t *) iv, len_bytes, in, out);
+ } else if (CBC_192_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL192 ");
+#endif
+ openssl_aes_192_cbc_enc(key, (uint8_t *) iv, len_bytes, in, out);
+ } else if (CBC_256_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL256 ");
+ fflush(0);
+#endif
+ openssl_aes_256_cbc_enc(key, (uint8_t *) iv, len_bytes, in, out);
+ } else {
+ fprintf(stderr, "Invalid key length: %d\n", k_len);
+ return 1;
+ }
+ return 0;
+}
+
+int OpenSslDec(uint8_t k_len,
+ uint8_t * key, uint8_t * in, uint8_t * iv, uint8_t * out, uint64_t len_bytes)
+{
+ if (CBC_128_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL128 ");
+#endif
+ openssl_aes_128_cbc_dec(key, (uint8_t *) iv, len_bytes, in, out);
+ } else if (CBC_192_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL192 ");
+#endif
+ openssl_aes_192_cbc_dec(key, (uint8_t *) iv, len_bytes, in, out);
+ } else if (CBC_256_BITS == k_len) {
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" OpenSSL256 ");
+#endif
+ openssl_aes_256_cbc_dec(key, (uint8_t *) iv, len_bytes, in, out);
+ } else {
+ fprintf(stderr, "Invalid key length: %d\n", k_len);
+ return 1;
+ }
+ return 0;
+}
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+ uint64_t a;
+
+ mismatch = memcmp(test, expected, len);
+ if (!mismatch) {
+ return OK;
+
+ } else {
+ OK = 1;
+ printf(" failed %s \t\t", data_name);
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ return OK;
+}
+
+int check_vector(struct cbc_vector *vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ int OK = 0;
+ aes_cbc_generic enc;
+ aes_cbc_generic dec;
+
+#ifdef CBC_VECTORS_VERBOSE
+ printf(" Keylen:%d PLen:%d ", (int)vector->K_LEN, (int)vector->P_LEN);
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" K:%p P:%p C:%p IV:%p expC:%p Keys:%p ", vector->K, vector->P, vector->C,
+ vector->IV, vector->EXP_C, vector->KEYS);
+#endif
+ fflush(0);
+#else
+ printf(".");
+#endif
+
+ if (CBC_128_BITS == vector->K_LEN) {
+ enc = (aes_cbc_generic) & aes_cbc_enc_128;
+ dec = (aes_cbc_generic) & aes_cbc_dec_128;
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" CBC128 ");
+#endif
+ } else if (CBC_192_BITS == vector->K_LEN) {
+ enc = (aes_cbc_generic) & aes_cbc_enc_192;
+ dec = (aes_cbc_generic) & aes_cbc_dec_192;
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" CBC192 ");
+#endif
+ } else if (CBC_256_BITS == vector->K_LEN) {
+ enc = (aes_cbc_generic) & aes_cbc_enc_256;
+ dec = (aes_cbc_generic) & aes_cbc_dec_256;
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" CBC256 ");
+#endif
+ } else {
+ printf("Invalid key length: %d\n", vector->K_LEN);
+ return 1;
+ }
+
+ // Allocate space for the calculated ciphertext
+ pt_test = malloc(vector->P_LEN);
+ o_ct_test = malloc(vector->P_LEN);
+ if ((pt_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+
+ aes_cbc_precomp(vector->K, vector->K_LEN, vector->KEYS);
+
+#ifdef CBC_VECTORS_VERBOSE
+ fflush(0);
+#endif
+ ////
+ // ISA-l Encrypt
+ ////
+ enc(vector->P, vector->IV, vector->KEYS->enc_keys, vector->C, vector->P_LEN);
+ if (NULL != vector->EXP_C) { //when the encrypted text is known, verify it is correct
+ OK |=
+ check_data(vector->EXP_C, vector->C, vector->P_LEN,
+ "ISA-L expected cypher text (C)");
+ }
+ OpenSslEnc(vector->K_LEN, vector->K, vector->P, vector->IV, o_ct_test, vector->P_LEN);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->P_LEN,
+ "OpenSSL vs ISA-L cypher text (C)");
+
+ memcpy(pt_test, vector->P, vector->P_LEN);
+ memset(vector->P, 0, vector->P_LEN);
+#ifdef CBC_VECTORS_VERBOSE
+ fflush(0);
+#endif
+
+ ////
+ // ISA-l Decrypt
+ ////
+ dec(vector->C, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN);
+ OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->P_LEN);
+ dec(o_ct_test, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN);
+ OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted OpenSSL (P)");
+ memset(vector->P, 0, vector->P_LEN);
+ OpenSslDec(vector->K_LEN, vector->K, vector->C, vector->IV, vector->P, vector->P_LEN);
+ OK |= check_data(vector->P, pt_test, vector->P_LEN, "OpenSSL decrypted ISA-L (P)");
+#ifdef CBC_VECTORS_VERBOSE
+ if (OK)
+ printf("Failed");
+ else
+ printf("Passed");
+
+ printf("\n");
+#endif
+
+ return OK;
+}
+
+int test_std_combinations(void)
+{
+ int const vectors_cnt = sizeof(cbc_vectors) / sizeof(cbc_vectors[0]);
+ int i;
+ uint8_t *iv = NULL;
+
+ printf("AES CBC standard test vectors:");
+#ifdef CBC_VECTORS_VERBOSE
+ printf("\n");
+#endif
+ posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN));
+ if (NULL == iv)
+ return 1;
+
+ for (i = 0; (i < vectors_cnt); i++) {
+ struct cbc_vector vect = cbc_vectors[i];
+
+ posix_memalign((void **)&vect.KEYS, 16, (sizeof(*vect.KEYS)));
+ if (NULL == vect.KEYS)
+ return 1;
+ // IV data must be aligned to a 16 byte boundary, so copy it into an aligned buffer and swap in that pointer
+ memcpy(iv, vect.IV, CBC_IV_DATA_LEN);
+ vect.IV = iv;
+ vect.C = NULL;
+ vect.C = malloc(vect.P_LEN);
+ if ((NULL == vect.C))
+ return 1;
+#ifdef CBC_VECTORS_VERBOSE
+ printf("vector[%d of %d] ", i, vectors_cnt);
+#endif
+ if (0 == (i % 25))
+ printf("\n");
+ if (0 == (i % 10))
+ fflush(0);
+
+ if (0 != check_vector(&vect))
+ return 1;
+
+ aligned_free(vect.KEYS);
+ free(vect.C);
+ }
+
+ aligned_free(iv);
+ printf("\n");
+ return 0;
+}
+
+int test_random_combinations(void)
+{
+ struct cbc_vector test;
+ int t;
+
+ printf("AES CBC random test vectors:");
+#ifdef CBC_VECTORS_VERBOSE
+ fflush(0);
+#endif
+ test.IV = NULL;
+ posix_memalign((void **)&test.IV, 16, (CBC_IV_DATA_LEN));
+ if (NULL == test.IV)
+ return 1;
+ test.KEYS = NULL;
+ posix_memalign((void **)&test.KEYS, 16, (sizeof(*test.KEYS)));
+ if (NULL == test.KEYS)
+ return 1;
+
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = 16 + ((rand() % TEST_LEN) & ~0xf); //must be a 16byte multiple
+ int offset = (rand() % MAX_UNALIGNED);
+ int Kindex = (rand() % (sizeof(Ksize) / sizeof(Ksize[0]))); // select one of the valid key sizes
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+
+ test.C = NULL;
+ test.P = NULL;
+ test.K = NULL;
+ test.EXP_C = NULL;
+ test.P_LEN = Plen;
+ test.K_LEN = Ksize[Kindex];
+
+ test.P = malloc(test.P_LEN + offset);
+ test.C = malloc(test.P_LEN + offset);
+ test.K = malloc(test.K_LEN + offset);
+ if ((NULL == test.P) || (NULL == test.C) || (NULL == test.K)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return -1;
+ }
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+
+ mk_rand_data(test.P, test.P_LEN);
+ mk_rand_data(test.K, test.K_LEN);
+ mk_rand_data(test.IV, CBC_IV_DATA_LEN);
+
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" Offset:0x%x ", offset);
+#endif
+ if (0 != check_vector(&test))
+ return 1;
+
+ test.C -= offset;
+ free(test.C);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ }
+
+ aligned_free(test.IV);
+ aligned_free(test.KEYS);
+ printf("\n");
+ return 0;
+}
+
+int test_efence_combinations(void)
+{
+ struct cbc_vector test;
+ int offset = 0;
+ int key_idx;
+ uint8_t *P = NULL, *C = NULL, *K = NULL, *IV = NULL;
+ uint8_t *key_data = NULL;
+
+ P = malloc(PAGE_LEN);
+ C = malloc(PAGE_LEN);
+ K = malloc(PAGE_LEN);
+ IV = malloc(PAGE_LEN);
+ key_data = malloc(PAGE_LEN);
+
+ if ((NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV)
+ || (NULL == key_data)
+ ) {
+ printf("malloc of testsize:0x%x failed\n", PAGE_LEN);
+ return -1;
+ }
+ // place buffers to end at page boundary
+ test.P_LEN = PAGE_LEN / 2;
+ test.EXP_C = NULL;
+
+ printf("AES CBC efence test vectors:");
+ for (key_idx = 0; key_idx < (sizeof(Ksize) / sizeof(Ksize[0])); key_idx++) {
+ test.K_LEN = Ksize[key_idx];
+
+ for (offset = 0; MAX_UNALINED > offset; offset++) {
+ if (0 == (offset % 80))
+ printf("\n");
+ // move the start and size of the data block towards the end of the page
+ test.P_LEN = ((PAGE_LEN / (1 + (2 * offset))) & ~0xff); // must be a multiple of 16
+ if (16 > test.P_LEN)
+ test.P_LEN = 16;
+ //Place data at end of page
+ test.P = P + PAGE_LEN - test.P_LEN - offset;
+ test.C = C + PAGE_LEN - test.P_LEN - offset;
+ test.K = K + PAGE_LEN - test.K_LEN - offset;
+ test.IV = IV + PAGE_LEN - CBC_IV_DATA_LEN - offset;
+ test.IV = test.IV - ((uint64_t) test.IV & 0xff); // align to 16 byte boundary
+ test.KEYS = (struct cbc_key_data *)
+ (key_data + PAGE_LEN - sizeof(*test.KEYS) - offset);
+ test.KEYS = (struct cbc_key_data *)
+ ((uint8_t *) test.KEYS - ((uint64_t) test.KEYS & 0xff)); // align to 16 byte boundary
+
+ mk_rand_data(test.P, test.P_LEN);
+ mk_rand_data(test.K, test.K_LEN);
+ mk_rand_data(test.IV, CBC_IV_DATA_LEN);
+#ifdef CBC_VECTORS_EXTRA_VERBOSE
+ printf(" Offset:0x%x ", offset);
+#endif
+ if (0 != check_vector(&test))
+ return 1;
+ }
+
+ }
+
+ free(P);
+ free(C);
+ free(K);
+ free(IV);
+ free(key_data);
+ printf("\n");
+ return 0;
+}
+
+int main(void)
+{
+ uint32_t OK = 0;
+
+ srand(TEST_SEED);
+ OK |= test_std_combinations();
+ OK |= test_random_combinations();
+ OK |= test_efence_combinations();
+ if (0 == OK) {
+ printf("...Pass\n");
+ } else {
+ printf("...Fail\n");
+ }
+ return OK;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c
new file mode 100644
index 000000000..fef096e48
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/cbc_std_vectors_test.c
@@ -0,0 +1,183 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * Run list of standard CBC test vectors through encode and decode checks.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <aes_cbc.h>
+#include "types.h"
+#include "cbc_std_vectors.h"
+
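+/*
+ * The per-key-size ISA-L CBC routines take equivalent argument lists, so the
+ * tests cast them to this common function pointer type and select the encrypt
+ * and decrypt routines by key length at run time (see check_vector below).
+ */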
+typedef void (*aes_cbc_generic) (uint8_t * in, uint8_t * IV, uint8_t * keys, uint8_t * out,
+ uint64_t len_bytes);
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+ uint64_t a;
+
+ mismatch = memcmp(test, expected, len);
+ if (!mismatch) {
+ return OK;
+
+ } else {
+ OK = 1;
+ printf(" failed %s \t\t", data_name);
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ return OK;
+}
+
+int check_vector(struct cbc_vector *vector)
+{
+ uint8_t *pt_test = NULL;
+ int OK = 0;
+ aes_cbc_generic enc;
+ aes_cbc_generic dec;
+
+ DEBUG_PRINT((" Keylen:%d PLen:%d ", (int)vector->K_LEN, (int)vector->P_LEN));
+ DEBUG_PRINT((" K:%p P:%p C:%p IV:%p expC:%p Keys:%p ", vector->K, vector->P, vector->C,
+ vector->IV, vector->EXP_C, vector->KEYS));
+ printf(".");
+
+ switch (vector->K_LEN) {
+ case CBC_128_BITS:
+ enc = (aes_cbc_generic) & aes_cbc_enc_128;
+ dec = (aes_cbc_generic) & aes_cbc_dec_128;
+ DEBUG_PRINT((" CBC128 "));
+ break;
+ case CBC_192_BITS:
+ enc = (aes_cbc_generic) & aes_cbc_enc_192;
+ dec = (aes_cbc_generic) & aes_cbc_dec_192;
+ DEBUG_PRINT((" CBC192 "));
+ break;
+ case CBC_256_BITS:
+ enc = (aes_cbc_generic) & aes_cbc_enc_256;
+ dec = (aes_cbc_generic) & aes_cbc_dec_256;
+ DEBUG_PRINT((" CBC256 "));
+ break;
+ default:
+ printf("Invalid key length: %d\n", vector->K_LEN);
+ return 1;
+ }
+
+	// Allocate space for a copy of the plaintext (used to verify the decrypt)
+ pt_test = malloc(vector->P_LEN);
+
+ if (pt_test == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+
+ aes_cbc_precomp(vector->K, vector->K_LEN, vector->KEYS);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ enc(vector->P, vector->IV, vector->KEYS->enc_keys, vector->C, vector->P_LEN);
+
+	if (NULL != vector->EXP_C) {	//when the expected cipher text is known, verify it matches
+ OK |= check_data(vector->EXP_C, vector->C, vector->P_LEN,
+ "ISA-L expected cypher text (C)");
+ }
+ memcpy(pt_test, vector->P, vector->P_LEN);
+ memset(vector->P, 0, vector->P_LEN);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ dec(vector->C, vector->IV, vector->KEYS->dec_keys, vector->P, vector->P_LEN);
+ OK |= check_data(vector->P, pt_test, vector->P_LEN, "ISA-L decrypted plain text (P)");
+ DEBUG_PRINT((OK ? "Failed\n" : "Passed\n"));
+
+ free(pt_test);
+ return OK;
+}
+
+int test_std_combinations(void)
+{
+ int const vectors_cnt = sizeof(cbc_vectors) / sizeof(cbc_vectors[0]);
+ int i;
+ uint8_t *iv = NULL;
+
+ printf("AES CBC standard test vectors: ");
+
+ posix_memalign((void **)&iv, 16, (CBC_IV_DATA_LEN));
+ if (NULL == iv)
+ return 1;
+
+ for (i = 0; (i < vectors_cnt); i++) {
+ struct cbc_vector vect = cbc_vectors[i];
+
+ posix_memalign((void **)&(vect.KEYS), 16, sizeof(*vect.KEYS));
+ if (NULL == vect.KEYS)
+ return 1;
+
+ // IV data must be aligned to 16 byte boundary so move data in
+ // aligned buffer and change out the pointer
+ memcpy(iv, vect.IV, CBC_IV_DATA_LEN);
+ vect.IV = iv;
+ vect.C = malloc(vect.P_LEN);
+ if (NULL == vect.C)
+ return 1;
+
+ DEBUG_PRINT(("vector[%d of %d] ", i, vectors_cnt));
+
+ if (0 != check_vector(&vect))
+ return 1;
+
+ aligned_free(vect.KEYS);
+ free(vect.C);
+ }
+
+ aligned_free(iv);
+ return 0;
+}
+
+int main(void)
+{
+ uint32_t OK = 0;
+
+ OK = test_std_combinations();
+
+ printf(0 == OK ? "Pass\n" : "Fail\n");
+ return OK;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm
new file mode 100644
index 000000000..62c3e344c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen2.asm
@@ -0,0 +1,1996 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+;       This code was derived and highly optimized from the code described in the paper:
+;               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+;       For the shift-based reductions used in this code, we used the method described in the paper:
+;               Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+;       Per the definition of the spec, it must be a multiple of 4 bytes.
+;       The code additionally supports aadLen of any length.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+;       Throughout the code, one-tab and two-tab indentations are used: one tab is for the GHASH part, two tabs are for the AES part.
+;
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+default rel
+; 4 callee-saved registers are pushed onto the stack (see FUNC_SAVE); the number of pushes must equal STACK_OFFSET
+%define STACK_OFFSET 8*4
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
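+;
+; Karatsuba: with A = a1*x^64 + a0 and B = b1*x^64 + b0 in GF(2)[x],
+;   A*B = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0
+; so only three carry-less multiplies (vpclmulqdq) are needed per GHASH_MUL.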
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba
+ vpshufd %%T2, %%GH, 01001110b
+ vpshufd %%T3, %%HK, 01001110b
+ vpxor %%T2, %%T2, %%GH ; %%T2 = (a1+a0)
+ vpxor %%T3, %%T3, %%HK ; %%T3 = (b1+b0)
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
+ vpclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ vpxor %%T2, %%T2, %%GH
+ vpxor %%T2, %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
+
+ vpslldq %%T3, %%T2, 8 ; shift-L %%T3 2 DWs
+ vpsrldq %%T2, %%T2, 8 ; shift-R %%T2 2 DWs
+ vpxor %%GH, %%GH, %%T3
+ vpxor %%T1, %%T1, %%T2 ; <%%T1:%%GH> = %%GH x %%HK
+
+ ;first phase of the reduction
+ vpslld %%T2, %%GH, 31 ; packed right shifting << 31
+ vpslld %%T3, %%GH, 30 ; packed right shifting shift << 30
+ vpslld %%T4, %%GH, 25 ; packed right shifting shift << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T5, %%T2, 4 ; shift-R %%T5 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%GH,1 ; packed left shifting >> 1
+ vpsrld %%T3,%%GH,2 ; packed left shifting >> 2
+ vpsrld %%T4,%%GH,7 ; packed left shifting >> 7
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpxor %%T2, %%T2, %%T5
+ vpxor %%GH, %%GH, %%T2
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+
+
+%endmacro
+
+
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds XORed values of the low and high parts of HashKey_i
+ vmovdqa %%T5, %%HK
+
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_3_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_5_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_8_k], %%T1
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
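+; Note: the tail is read one byte at a time starting from INPUT+LENGTH-1, so no
+; memory beyond the LENGTH input bytes is ever touched.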
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ vpxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+	vpinsrq	%%OUTPUT, [%%INPUT],0		;Read in 8 bytes if they exist
+ je %%_done
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ vpinsrq %%OUTPUT, %%TMP1, 1
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ vpinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 5
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+
+ vmovdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; the current data offset (DATA_OFFSET), and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
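+; A partial block is left over when a previous update call ended on a non-16-byte
+; boundary; its saved keystream block (PBlockEncKey) is combined with the new
+; input here to finish that block before the bulk block processing starts.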
+%macro PARTIAL_BLOCK 7
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%DATA_OFFSET %5
+%define %%AAD_HASH %6
+%define %%ENC_DEC %7
+ mov r13, [%%GDATA + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read: ;Finished reading in data
+
+
+ vmovdqu xmm9, [%%GDATA + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ cmp r13, rax
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+	jge	%%_no_extra_mask_1		;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_1:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpand xmm3, xmm1
+ vpshufb xmm3, [SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN
+%%_dec_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+	jge	%%_no_extra_mask_2		;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_2:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN
+%%_encode_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
+; Updated AAD_HASH is returned in %%T3
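+; Handling the leftover 0..7 blocks here first lets the main loop
+; (GHASH_8_ENCRYPT_8_PARALLEL) always work on exactly 8 blocks per iteration.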
+
+%macro INITIAL_BLOCKS 23
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%HASH_KEY %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+
+%assign i (8-%%num_initial_blocks)
+ movdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg
+ ; start AES for %%num_initial_blocks blocks
+ vmovdqu %%CTR, [%%GDATA + CurCount] ; %%CTR = Y0
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+vmovdqu %%T_key, [%%GDATA+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep 9
+vmovdqu %%T_key, [%%GDATA+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA+16*10]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ vpxor reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ vmovdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; no need for precomputed constants
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; prepare 8 counter blocks and encrypt the next 8 blocks (128 bytes) of data
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM1, %%CTR
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM2, %%CTR
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM3, %%CTR
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM4, %%CTR
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM5, %%CTR
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM6, %%CTR
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM7, %%CTR
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM8, %%CTR
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu %%T_key, [%%GDATA+16*0]
+ vpxor %%XMM1, %%T_key
+ vpxor %%XMM2, %%T_key
+ vpxor %%XMM3, %%T_key
+ vpxor %%XMM4, %%T_key
+ vpxor %%XMM5, %%T_key
+ vpxor %%XMM6, %%T_key
+ vpxor %%XMM7, %%T_key
+ vpxor %%XMM8, %%T_key
+
+
+%assign i 1
+%rep 9 ; do 9 rounds
+ vmovdqu %%T_key, [%%GDATA+16*i]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ vmovdqu %%T_key, [%%GDATA+16*i]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; r11 is the data offset value
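+; The AES rounds for the 8 new counter blocks are interleaved with the Karatsuba
+; GHASH multiplies of the 8 ciphertext blocks produced by the previous iteration,
+; so the aesenc and pclmulqdq latencies overlap.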
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONE]
+ vpaddd %%XMM3, %%XMM2, [ONE]
+ vpaddd %%XMM4, %%XMM3, [ONE]
+ vpaddd %%XMM5, %%XMM4, [ONE]
+ vpaddd %%XMM6, %%XMM5, [ONE]
+ vpaddd %%XMM7, %%XMM6, [ONE]
+ vpaddd %%XMM8, %%XMM7, [ONE]
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONEf]
+ vpaddd %%XMM3, %%XMM2, [ONEf]
+ vpaddd %%XMM4, %%XMM3, [ONEf]
+ vpaddd %%XMM5, %%XMM4, [ONEf]
+ vpaddd %%XMM6, %%XMM5, [ONEf]
+ vpaddd %%XMM7, %%XMM6, [ONEf]
+ vpaddd %%XMM8, %%XMM7, [ONEf]
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%T1
+ vpxor %%XMM2, %%T1
+ vpxor %%XMM3, %%T1
+ vpxor %%XMM4, %%T1
+ vpxor %%XMM5, %%T1
+ vpxor %%XMM6, %%T1
+ vpxor %%XMM7, %%T1
+ vpxor %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+
+ vpshufd %%T6, %%T2, 01001110b
+ vpxor %%T6, %%T2
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%T6, %%T6, %%T5, 0x00 ;
+
+
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vpxor %%T6, %%T4
+ vpxor %%T6, %%T7
+
+
+ vmovdqu %%T5, [%%GDATA + 16*10]
+
+%assign i 0
+%assign j 1
+%rep 8
+ %ifidn %%ENC_DEC, ENC
+
+ %ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %endif
+
+ vaesenclast reg(j), reg(j), %%T2
+
+ %else
+
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+
+ %endif
+
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+ vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
+	vpsrldq	%%T6, %%T6, 8		; shift-R %%T6 2 DWs
+ vpxor %%T7, %%T3
+ vpxor %%T6, %%T4 ; accumulate the results in %%T6:%%T7
+
+
+ ;first phase of the reduction
+
+ vpslld %%T2, %%T7, 31 ; packed right shifting << 31
+ vpslld %%T3, %%T7, 30 ; packed right shifting shift << 30
+ vpslld %%T4, %%T7, 25 ; packed right shifting shift << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ %ifidn %%ENC_DEC, ENC
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer
+ %endif
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%T7,1 ; packed left shifting >> 1
+ vpsrld %%T3,%%T7,2 ; packed left shifting >> 2
+ vpsrld %%T4,%%T7,7 ; packed left shifting >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK]
+ vpshufb %%XMM3, [SHUF_MASK]
+ vpshufb %%XMM4, [SHUF_MASK]
+ vpshufb %%XMM5, [SHUF_MASK]
+ vpshufb %%XMM6, [SHUF_MASK]
+ vpshufb %%XMM7, [SHUF_MASK]
+ vpshufb %%XMM8, [SHUF_MASK]
+
+
+ vpxor %%XMM1, %%T6
+
+%endmacro
+
+
+; GHASH the last 8 ciphertext blocks.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+ ;; Karatsuba Method
+
+
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpxor %%T2, %%XMM1
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vmovdqu %%T3, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpxor %%T2, %%XMM2
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpxor %%T2, %%XMM3
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpxor %%T2, %%XMM4
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpxor %%T2, %%XMM5
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpxor %%T2, %%XMM6
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpxor %%T2, %%XMM7
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpxor %%T2, %%XMM8
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T4
+ vpxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;first phase of the reduction
+
+ vpslld %%T2, %%T7, 31 ; packed right shifting << 31
+ vpslld %%T3, %%T7, 30 ; packed right shifting shift << 30
+ vpslld %%T4, %%T7, 25 ; packed right shifting shift << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%T7,1 ; packed left shifting >> 1
+ vpsrld %%T3,%%T7,2 ; packed left shifting >> 2
+ vpsrld %%T4,%%T7,7 ; packed left shifting >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+%endmacro
+
+
+; Encryption of a single block
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxor %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep 9
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+ vaesenclast %%XMM0, [%%GDATA+16*10]
+%endmacro
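+; (Used below both to derive the HashKey, by encrypting an all-zero block in the
+; precomp routine, and to compute E(K, Y0) / E(K, Yn) for the tag and for
+; partial blocks.)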
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+	;; Required for Update/GCM_ENC
+ ;the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_data struct to prepare for encoding/decoding.
+; Input: gcm_data struct* (GDATA), IV, Additional Authentication data (A_IN), Additional
+; Data length (A_LEN)
+; Output: Updated GDATA with the hash of A_IN (AadHash) and initialized other parts of GDATA.
+; Clobbers rax, r10-r13, and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 4
+%define %%GDATA %1
+%define %%IV %2
+%define %%A_IN %3
+%define %%A_LEN %4
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ vmovdqu %%SUBHASH, [%%GDATA + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ vpxor xmm2, xmm3
+ mov r10, %%A_LEN
+
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH ; my_ctx_data.aad hash = aad_hash
+ mov [%%GDATA + AadLen], r10 ; my_ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA + InLen], r10 ; my_ctx_data.in_length = 0
+ mov [%%GDATA + PBlockLen], r10 ; my_ctx_data.partial_block_length = 0
+ vmovdqu [%%GDATA + PBlockEncKey], xmm2 ; my_ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ vmovdqu xmm2, [r10]
+ vmovdqu [%%GDATA + OrigIV], xmm2 ; my_ctx_data.orig_IV = iv
+
+ vpshufb xmm2, [SHUF_MASK]
+
+ vmovdqu [%%GDATA + CurCount], xmm2 ; my_ctx_data.current_counter = iv
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_data struct has been
+; initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 5
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%ENC_DEC %5
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes, process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA+InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+ vmovdqu xmm13, [%%GDATA + HashKey] ; xmm13 = HashKey
+ vmovdqu xmm8, [%%GDATA + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+
+ mov r13, %%PLAIN_CYPH_LEN
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ; save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ vpshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ vpshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ GHASH_LAST_8 %%GDATA, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ vmovdqu [%%GDATA + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
+ vmovdqu [%%GDATA + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+	mov	[%%GDATA + PBlockLen], r13		; my_ctx_data.partial_block_length = r13
+	; handle the last <16 Byte block separately
+
+ vpaddd xmm9, [ONE] ; INCR CNT to get Yn
+ vmovdqu [%%GDATA + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Yn)
+ vmovdqu [%%GDATA + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa xmm2, xmm1
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpand xmm2, xmm1
+ vpshufb xmm2, [SHUF_MASK]
+ vpxor xmm14, xmm2
+ vmovdqu [%%GDATA + AadHash], xmm14
+
+ %else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ vpxor xmm14, xmm9
+ vmovdqu [%%GDATA + AadHash], xmm14
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ %endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_data struct* (GDATA) and whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
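+; Roughly, per the GCM spec, the tag computed below is
+;     T = MSB_t(GHASH(H, AAD, C) XOR E(K, Y0)),   t in {128, 96, 64} bits:
+; any pending partial block is folded into the hash, the length block
+; len(A)||len(C) (in bits) is mixed in with one more GHASH_MUL, and the result
+; is XORed with the encrypted original counter block before 16/12/8 bytes are
+; written out.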
+%macro GCM_COMPLETE 4
+%define %%GDATA %1
+%define %%AUTH_TAG %2
+%define %%AUTH_TAG_LEN %3
+%define %%ENC_DEC %4
+%define %%PLAIN_CYPH_LEN rax
+
+ mov r12, [%%GDATA + PBlockLen]
+ vmovdqu xmm14, [%%GDATA+AadHash]
+ vmovdqu xmm13, [%%GDATA+HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA+AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA+InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovd xmm15, r12d ; len(A) in xmm15
+
+	shl	%%PLAIN_CYPH_LEN, 3                     ; len(C) in bits  (*8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu xmm9, [%%GDATA+OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Y0)
+
+ vpxor xmm9, xmm14
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_precomp_avx_gen2
+; (gcm_data *my_ctx_data);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
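+; Flow sketch: the hash key is H = E(K, 0^128), i.e. the AES encryption of an
+; all-zero block; it is byte-reflected, shifted to HashKey<<1 mod poly, and
+; PRECOMPUTE then derives the higher powers used by the 8-block GHASH loop.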
+global aesni_gcm128_precomp_avx_gen2
+aesni_gcm128_precomp_avx_gen2:
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, 1
+ vpsrlq xmm2, 63
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8
+ vpsrldq xmm1, xmm1, 8
+ vpor xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [TWOONE]
+ vpand xmm2, [POLY]
+ vpxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_init_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len); /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_init_avx_gen2
+aesni_gcm128_init_avx_gen2:
+
+ push r12
+ push r13
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 1*16
+ vmovdqu [rsp + 0*16],xmm6
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+%endif
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_update_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len); /* Length of data in Bytes for encryption. must be a multiple of 16 bytes*/
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_update_avx_gen2
+aesni_gcm128_enc_update_avx_gen2:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_update_avx_gen2(
+; gcm_data *my_ctx_data,
+;	u8	*out, /* Plaintext output. Decrypt in-place is allowed.  */
+;	const	u8 *in, /* Cyphertext input */
+;	u64	plaintext_len); /* Length of data in Bytes for decryption. must be a multiple of 16 bytes*/
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_update_avx_gen2
+aesni_gcm128_dec_update_avx_gen2:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_finalize_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_finalize_avx_gen2
+aesni_gcm128_enc_finalize_avx_gen2:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_finalize_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_finalize_avx_gen2
+aesni_gcm128_dec_finalize_avx_gen2:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len, /* Length of data in Bytes for encryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_avx_gen2
+aesni_gcm128_enc_avx_gen2:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ GCM_COMPLETE arg1, arg8, arg9, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+;	u64	plaintext_len, /* Length of data in Bytes for decryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_avx_gen2
+aesni_gcm128_dec_avx_gen2:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ GCM_COMPLETE arg1, arg8, arg9, DEC
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm
new file mode 100644
index 000000000..c1acb4c87
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_avx_gen4.asm
@@ -0,0 +1,1990 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+;       This code was derived and highly optimized from the code described in paper:
+;               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;       The details of the implementation are explained in:
+;               Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+;       The spec defines aadLen to be a multiple of 4 bytes.
+;       The code additionally supports any aadLen length.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+;       Throughout the code, one-tab and two-tab indentations are used: one tab is for the GHASH part, two tabs are for the AES part.
+;
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+default rel
+; 4 registers are pushed onto the stack, so STACK_OFFSET must account for them
+%define STACK_OFFSET 8*4
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
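+; Sketch of the multiply below: with A = a1:a0 and B = b1:b0 (64-bit halves),
+;     A*B = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0   (carry-less),
+; the middle term is split across the high/low 128-bit halves and the 256-bit
+; product is then reduced in two PCLMUL phases against POLY2.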
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1
+ vpxor %%GH, %%GH, %%T3
+
+
+ vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs
+ vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs
+
+ vpxor %%T1, %%T1, %%T3
+ vpxor %%GH, %%GH, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%GH, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%GH, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%GH, %%T3, %%GH, 0x10
+ vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%GH, %%GH, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+
+
+%endmacro
+
+
+; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx_gen4
+; functions, but are kept to allow users to switch cpu architectures between calls
+; of pre, init, update, and finalize.
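+; The stored powers let GHASH_8_ENCRYPT_8_PARALLEL fold eight blocks per pass,
+; roughly X = C1*H^8 + C2*H^7 + ... + C8*H (all carry-less, mod the GCM poly),
+; so a single reduction is paid per 8 blocks instead of one per block.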
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+	; HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
+ vmovdqa %%T5, %%HK
+
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_3_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_5_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_8_k], %%T1
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
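+; For example, LENGTH = 5 reads exactly 5 bytes, one at a time from the end of
+; the buffer backwards, into the low qword of OUTPUT without touching memory
+; past INPUT+LENGTH; LENGTH = 8 is read with a single vpinsrq.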
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ vpxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+	vpinsrq	%%OUTPUT, [%%INPUT],0		;Read in 8 bytes if they exist
+ je %%_done
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ vpinsrq %%OUTPUT, %%TMP1, 1
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ vpinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
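+; The AAD is consumed 16 bytes at a time: each chunk is byte-reflected, XORed
+; into the running hash and multiplied by HASH_KEY via GHASH_MUL; a trailing
+; chunk shorter than 16 bytes is zero-padded through READ_SMALL_DATA_INPUT.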
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 5
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+
+ vmovdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles the encryption/decryption and GHASH of partial blocks carried between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; the current data offset (DATA_OFFSET), and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
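+; In short: if a previous update left PBlockLen buffered bytes, the saved
+; keystream block (PBlockEncKey) is re-aligned using the SHIFT_MASK/ALL_F
+; tables, XORed with the new input to continue the 16-byte block, GHASHed once
+; the block becomes full, and the updated partial-block state is written back
+; to GDATA.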
+%macro PARTIAL_BLOCK 7
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%DATA_OFFSET %5
+%define %%AAD_HASH %6
+%define %%ENC_DEC %7
+ mov r13, [%%GDATA + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read: ;Finished reading in data
+
+
+ vmovdqu xmm9, [%%GDATA + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+	jge	%%_no_extra_mask_1		;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_1:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpand xmm3, xmm1
+ vpshufb xmm3, [SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN
+%%_dec_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+	jge	%%_no_extra_mask_2		;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_2:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN
+%%_encode_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as pointers only, not modified
+; Updated AAD_HASH is returned in %%T3
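+; e.g. for a = 200 plaintext bytes: b = floor(200/16) = 12, so
+; %%num_initial_blocks = 12 mod 8 = 4; those 4 blocks are encrypted and GHASHed
+; here, and (since at least 128 bytes remain) the next 8 whole blocks are also
+; encrypted ahead of time so their GHASH can be folded later.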
+
+%macro INITIAL_BLOCKS 23
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%HASH_KEY %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+
+%assign i (8-%%num_initial_blocks)
+ movdqu reg(i), %%XMM8 ;move AAD_HASH to temp reg
+ ; start AES for %%num_initial_blocks blocks
+ vmovdqu %%CTR, [%%GDATA + CurCount] ; %%CTR = Y0
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+vmovdqu %%T_key, [%%GDATA+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep 9
+vmovdqu %%T_key, [%%GDATA+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA+16*10]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ vpxor reg(j), reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ vmovdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; no need for precomputed constants
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM1, %%CTR
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM2, %%CTR
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM3, %%CTR
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM4, %%CTR
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM5, %%CTR
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM6, %%CTR
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM7, %%CTR
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM8, %%CTR
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu %%T_key, [%%GDATA+16*0]
+ vpxor %%XMM1, %%XMM1, %%T_key
+ vpxor %%XMM2, %%XMM2, %%T_key
+ vpxor %%XMM3, %%XMM3, %%T_key
+ vpxor %%XMM4, %%XMM4, %%T_key
+ vpxor %%XMM5, %%XMM5, %%T_key
+ vpxor %%XMM6, %%XMM6, %%T_key
+ vpxor %%XMM7, %%XMM7, %%T_key
+ vpxor %%XMM8, %%XMM8, %%T_key
+
+
+%assign i 1
+%rep 9 ; do 9 rounds
+ vmovdqu %%T_key, [%%GDATA+16*i]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ vmovdqu %%T_key, [%%GDATA+16*i]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpxor %%XMM1, %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value
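+; Roughly: the 10 AES rounds for the 8 new counter blocks are interleaved with
+; the PCLMUL folding of the 8 previous ciphertext blocks against
+; HashKey_8..HashKey, keeping the AES and multiplier units busy at the same
+; time, with a single polynomial reduction per 8 blocks.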
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONE]
+ vpaddd %%XMM3, %%XMM2, [ONE]
+ vpaddd %%XMM4, %%XMM3, [ONE]
+ vpaddd %%XMM5, %%XMM4, [ONE]
+ vpaddd %%XMM6, %%XMM5, [ONE]
+ vpaddd %%XMM7, %%XMM6, [ONE]
+ vpaddd %%XMM8, %%XMM7, [ONE]
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONEf]
+ vpaddd %%XMM3, %%XMM2, [ONEf]
+ vpaddd %%XMM4, %%XMM3, [ONEf]
+ vpaddd %%XMM5, %%XMM4, [ONEf]
+ vpaddd %%XMM6, %%XMM5, [ONEf]
+ vpaddd %%XMM7, %%XMM6, [ONEf]
+ vpaddd %%XMM8, %%XMM7, [ONEf]
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ vpxor %%XMM2, %%XMM2, %%T1
+ vpxor %%XMM3, %%XMM3, %%T1
+ vpxor %%XMM4, %%XMM4, %%T1
+ vpxor %%XMM5, %%XMM5, %%T1
+ vpxor %%XMM6, %%XMM6, %%T1
+ vpxor %%XMM7, %%XMM7, %%T1
+ vpxor %%XMM8, %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T1, %%T4, %%T3
+
+
+ vmovdqu %%T5, [%%GDATA + 16*10]
+
+%assign i 0
+%assign j 1
+%rep 8
+ %ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %endif
+
+ %ifidn %%ENC_DEC, ENC
+ vaesenclast reg(j), reg(j), %%T2
+ %else
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+ %endif
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
+	vpsrldq	%%T6, %%T6, 8		; shift-R %%T6 2 DWs
+ vpxor %%T7, %%T7, %%T3
+ vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ %ifidn %%ENC_DEC, ENC
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer
+ %endif
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T1, %%T1, %%T4 ; the result is in %%T1
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+
+ vpxor %%XMM1, %%T1
+
+
+
+%endmacro
+
+
+; GHASH the last 8 ciphertext blocks.
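+; Each block uses the Karatsuba form: besides a1*b1 and a0*b0, the middle term
+; comes from (a1 xor a0)*(b1 xor b0), saving one vpclmulqdq per block; the
+; three partial sums are recombined and reduced once at the end via POLY2.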
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+ ;; Karatsuba Method
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM1
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM2
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM3
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM4
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM5
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM6
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM7
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM8
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T7, %%T4
+ vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
+%endmacro
+
+; Encryption of a single block
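+; 9 full rounds plus a final round, i.e. the 11 round keys of an AES-128 key
+; schedule laid out at GDATA+16*0 .. GDATA+16*10 (this is the gcm128 variant).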
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxor %%XMM0, %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep 9
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+ vaesenclast %%XMM0, [%%GDATA+16*10]
+%endmacro
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+	;; Required for Update/GCM_ENC
+ ;the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_data struct to prepare for encoding/decoding.
+; Input: gcm_data struct* (GDATA), IV, Additional Authentication data (A_IN), Additional
+; Data length (A_LEN)
+; Output: Updated GDATA with the hash of A_IN (AadHash) and initialized other parts of GDATA.
+; Clobbers rax, r10-r13, and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
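+; Concretely, the code below stores: AadHash (the GHASH of the AAD), AadLen,
+; InLen = 0, PBlockLen = 0, OrigIV (the raw pre-counter block) and CurCount
+; (the same block byte-reflected, ready for counter increments).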
+%macro GCM_INIT 4
+%define %%GDATA %1
+%define %%IV %2
+%define %%A_IN %3
+%define %%A_LEN %4
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ vmovdqu %%SUBHASH, [%%GDATA + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ vpxor xmm2, xmm3
+ mov r10, %%A_LEN
+
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH ; my_ctx_data.aad hash = aad_hash
+ mov [%%GDATA + AadLen], r10 ; my_ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA + InLen], r10 ; my_ctx_data.in_length = 0
+ mov [%%GDATA + PBlockLen], r10 ; my_ctx_data.partial_block_length = 0
+ vmovdqu [%%GDATA + PBlockEncKey], xmm2 ; my_ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ vmovdqu xmm2, [r10]
+ vmovdqu [%%GDATA + OrigIV], xmm2 ; my_ctx_data.orig_IV = iv
+
+ vpshufb xmm2, [SHUF_MASK]
+
+ vmovdqu [%%GDATA + CurCount], xmm2 ; my_ctx_data.current_counter = iv
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_data struct has been
+; initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 5
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%ENC_DEC %5
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
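+; e.g. 250 input bytes with no pending partial block: 240 whole-block bytes =
+; 15 blocks, 7 handled as initial blocks, the remaining 8 encrypted ahead in
+; INITIAL_BLOCKS and GHASHed at '%%_eight_cipher_left', and the 10 tail bytes
+; handled in '%%_zero_cipher_left .. %%_multiple_of_16_bytes'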
+
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA+InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+ vmovdqu xmm13, [%%GDATA + HashKey] ; xmm13 = HashKey
+ vmovdqu xmm8, [%%GDATA + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+
+ mov r13, %%PLAIN_CYPH_LEN
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ; save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ vpshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ vpshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ GHASH_LAST_8 %%GDATA, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ vmovdqu [%%GDATA + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
+ vmovdqu [%%GDATA + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+        mov     [%%GDATA + PBlockLen], r13          ; my_ctx_data.partial_block_length = r13
+        ; handle the last <16 Byte block separately
+
+
+ vpaddd xmm9, xmm9, [ONE] ; INCR CNT to get Yn
+ vmovdqu [%%GDATA + CurCount], xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Yn)
+ vmovdqu [%%GDATA + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa xmm2, xmm1
+ vpxor xmm9, xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpand xmm2, xmm2, xmm1
+ vpshufb xmm2, [SHUF_MASK]
+ vpxor xmm14, xmm14, xmm2
+ vmovdqu [%%GDATA + AadHash], xmm14
+ %else
+ vpxor xmm9, xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ vpxor xmm14, xmm14, xmm9
+ vmovdqu [%%GDATA + AadHash], xmm14
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ %endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_data struct* (GDATA) and whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 4
+%define %%GDATA %1
+%define %%AUTH_TAG %2
+%define %%AUTH_TAG_LEN %3
+%define %%ENC_DEC %4
+%define %%PLAIN_CYPH_LEN rax
+
+ mov r12, [%%GDATA + PBlockLen]
+ vmovdqu xmm14, [%%GDATA+AadHash]
+ vmovdqu xmm13, [%%GDATA+HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA+AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA+InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovd xmm15, r12d ; len(A) in xmm15
+
+        shl     %%PLAIN_CYPH_LEN, 3                 ; len(C) in bits (*8)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu xmm9, [%%GDATA + OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Y0)
+
+ vpxor xmm9, xmm9, xmm14
+
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ; GCM_COMPLETE
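+
+; In GCM terms, the computation above amounts to (illustrative summary, not a
+; normative restatement of the spec):
+;       S   = GHASH_H( padded(AAD) || padded(C) || len(AAD) || len(C) )
+;       Tag = MSB_tlen( E(K, Y0) XOR S )
+; where len(AAD) and len(C) are 64-bit bit lengths, Y0 is the counter block
+; saved in OrigIV, and tlen is the requested AUTH_TAG_LEN (16, 12 or 8 bytes).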
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_precomp_avx_gen4
+; (gcm_data *my_ctx_data);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_precomp_avx_gen4
+aesni_gcm128_precomp_avx_gen4:
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, xmm6, 1
+ vpsrlq xmm2, xmm2, 63
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8
+ vpsrldq xmm1, xmm1, 8
+ vpor xmm6, xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [TWOONE]
+ vpand xmm2, xmm2, [POLY]
+ vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ret
+
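+; In short (illustrative): HashKey = E(K, 0^128), byte-reflected, then doubled
+; carry-lessly across 128 bits; the TWOONE/POLY fix-up conditionally xors in
+; the reduction polynomial when the shifted-out top bit was set, leaving
+; HashKey<<1 mod poly in the form expected by GHASH_MUL.
+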
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_init_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len); /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_init_avx_gen4
+aesni_gcm128_init_avx_gen4:
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 1*16
+ vmovdqu [rsp + 0*16],xmm6
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+%endif
+ pop r13
+ pop r12
+
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_update_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len); /* Length of data in Bytes for encryption. must be a multiple of 16 bytes*/
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_update_avx_gen4
+aesni_gcm128_enc_update_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_update_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Cyphertext input */
+;	u64	plaintext_len); /* Length of data in Bytes for decryption. Must be a multiple of 16 bytes */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_update_avx_gen4
+aesni_gcm128_dec_update_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_finalize_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_finalize_avx_gen4
+aesni_gcm128_enc_finalize_avx_gen4:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_finalize_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_finalize_avx_gen4
+aesni_gcm128_dec_finalize_avx_gen4:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len, /* Length of data in Bytes for encryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_avx_gen4
+aesni_gcm128_enc_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ GCM_COMPLETE arg1, arg8, arg9, ENC
+
+ FUNC_RESTORE
+
+ ret
+
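+; A minimal usage sketch in C, assuming the AES round keys have already been
+; expanded into the gcm_data struct and aesni_gcm128_precomp_avx_gen4() has
+; been run on it; variable names below are placeholders, and lengths are
+; subject to the constraints documented with each prototype:
+;
+;	gcm_data gdata;                          /* round keys already expanded */
+;	aesni_gcm128_precomp_avx_gen4(&gdata);   /* derive HashKey tables       */
+;
+;	/* one-shot */
+;	aesni_gcm128_enc_avx_gen4(&gdata, out, in, len, iv, aad, aad_len, tag, 16);
+;
+;	/* or streaming: init / update / finalize */
+;	aesni_gcm128_init_avx_gen4(&gdata, iv, aad, aad_len);
+;	aesni_gcm128_enc_update_avx_gen4(&gdata, out, in, len);
+;	aesni_gcm128_enc_finalize_avx_gen4(&gdata, tag, 16);
+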
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+;	u64	plaintext_len, /* Length of data in Bytes for decryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_avx_gen4
+aesni_gcm128_dec_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ GCM_COMPLETE arg1, arg8, arg9, DEC
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm
new file mode 100644
index 000000000..a825b162a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm128_sse.asm
@@ -0,0 +1,2033 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+;       Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+;       From the definition of the spec, aadLen must be a multiple of 4 bytes.
+;       The code additionally supports aadLen of any length.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; throughout the code, one-tab and two-tab indentations are used. One tab is for the GHASH part, two tabs are for the AES part.
+;
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+default rel
+; need to push 4 registers onto the stack to preserve them
+%define STACK_OFFSET 8*4
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Karatsuba Method
+ movdqa %%T1, %%GH
+ pshufd %%T2, %%GH, 01001110b
+ pshufd %%T3, %%HK, 01001110b
+ pxor %%T2, %%GH ; %%T2 = (a1+a0)
+ pxor %%T3, %%HK ; %%T3 = (b1+b0)
+
+ pclmulqdq %%T1, %%HK, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
+ pclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T2, %%GH
+ pxor %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
+
+ movdqa %%T3, %%T2
+ pslldq %%T3, 8 ; shift-L %%T3 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%GH, %%T3
+ pxor %%T1, %%T2 ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%GH
+ movdqa %%T3, %%GH
+ movdqa %%T4, %%GH ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+        pslld   %%T2, 31                                ; packed left shift << 31
+        pslld   %%T3, 30                                ; packed left shift << 30
+        pslld   %%T4, 25                                ; packed left shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T4
+
+ movdqa %%T5, %%T2
+ psrldq %%T5, 4 ; shift-R %%T5 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+        movdqa  %%T2,%%GH                               ; make 3 copies of %%GH (in %%T2, %%T3, %%T4) for doing three shift operations
+ movdqa %%T3,%%GH
+ movdqa %%T4,%%GH
+
+        psrld   %%T2,1                                  ; packed right shift >> 1
+        psrld   %%T3,2                                  ; packed right shift >> 2
+        psrld   %%T4,7                                  ; packed right shift >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T4
+
+ pxor %%T2, %%T5
+ pxor %%GH, %%T2
+        pxor    %%GH, %%T1                              ; the result is in %%GH
+
+
+%endmacro
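+
+; Illustrative expansion of the Karatsuba step above, writing the 128-bit
+; operands as A = a1*x^64 + a0 and B = b1*x^64 + b0 (carry-less arithmetic):
+;       A*B = a1*b1*x^128 + (a1*b0 + a0*b1)*x^64 + a0*b0
+;       a1*b0 + a0*b1 = (a1+a0)*(b1+b0) + a1*b1 + a0*b0
+; so only three pclmulqdq multiplications are needed; the two shift/xor phases
+; that follow reduce the 256-bit product modulo the GCM polynomial.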
+
+
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds XORed values of the low and high parts of HashKey_i
+ movdqa %%T4, %%HK
+ pshufd %%T1, %%HK, 01001110b
+ pxor %%T1, %%HK
+ movdqu [%%GDATA + HashKey_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^2<<1 mod poly
+ movdqu [%%GDATA + HashKey_2], %%T4 ; [HashKey_2] = HashKey^2<<1 mod poly
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^3<<1 mod poly
+ movdqu [%%GDATA + HashKey_3], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_3_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^4<<1 mod poly
+ movdqu [%%GDATA + HashKey_4], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^5<<1 mod poly
+ movdqu [%%GDATA + HashKey_5], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_5_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^6<<1 mod poly
+ movdqu [%%GDATA + HashKey_6], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^7<<1 mod poly
+ movdqu [%%GDATA + HashKey_7], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^8<<1 mod poly
+ movdqu [%%GDATA + HashKey_8], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_8_k], %%T1
+
+
+%endmacro
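+
+; Rough C-like sketch of what PRECOMPUTE stores (helper names below are
+; hypothetical, for illustration only):
+;
+;	H[1] = HashKey;                          /* HashKey<<1 mod poly        */
+;	for (i = 2; i <= 8; i++)
+;		H[i] = ghash_mul(H[i-1], H[1]);  /* HashKey^i<<1 mod poly      */
+;	for (i = 1; i <= 8; i++)
+;		H_k[i] = high64(H[i]) ^ low64(H[i]);  /* Karatsuba middle term */
+;
+; HashKey_i / HashKey_i_k are later consumed by GHASH_8_ENCRYPT_8_PARALLEL and
+; GHASH_LAST_8 to hash eight ciphertext blocks with a single final reduction.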
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ pxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+        pinsrq  %%OUTPUT, [%%INPUT],0           ;Read in 8 bytes if they exist
+ je %%_done
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ pinsrq %%OUTPUT, %%TMP1, 1
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ pinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
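+
+; Example walk-through (illustrative): for LENGTH = 11, the first 8 bytes are
+; read directly into the low qword with pinsrq, then the remaining 3 bytes are
+; gathered one at a time from the end of the buffer into a temporary register
+; and inserted as the high qword, so no bytes beyond INPUT+LENGTH are touched.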
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 5
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ pxor %%AAD_HASH, %%AAD_HASH
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+
+ movdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ pshufb %%XTMP1, [SHUF_MASK]
+ pxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ pshufb %%XTMP1, [SHUF_MASK]
+ pxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
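+
+; The loop above evaluates the AAD hash in Horner form (illustrative):
+;       AadHash = ( ... ((A_1 * H) + A_2) * H ... + A_n) * H
+; where A_1..A_n are the byte-reflected 16-byte AAD blocks (the last one
+; zero-padded via READ_SMALL_DATA_INPUT) and H is the supplied HASH_KEY.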
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; the current data offset (DATA_OFFSET), and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 7
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%DATA_OFFSET %5
+%define %%AAD_HASH %6
+%define %%ENC_DEC %7
+ mov r13, [%%GDATA + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ XLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read: ;Finished reading in data
+
+
+ movdqu xmm9, [%%GDATA + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ movdqu xmm13, [%%GDATA + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ movdqu xmm2, [r12] ; get the appropriate shuffle mask
+ pshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ movdqa xmm3, xmm1
+ pxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_1              ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_1:
+
+ movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ pand xmm3, xmm1
+ pshufb xmm3, [SHUF_MASK]
+ pshufb xmm3, xmm2
+ pxor %%AAD_HASH, xmm3
+
+
+ cmp r15, 0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN
+%%_dec_done:
+ movdqu [%%GDATA + AadHash], %%AAD_HASH
+
+%else
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_2              ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_2:
+
+ movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ pshufb xmm9, [SHUF_MASK]
+ pshufb xmm9, xmm2
+ pxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN
+%%_encode_done:
+ movdqu [%%GDATA + AadHash], %%AAD_HASH
+
+ pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ pshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ movq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ psrldq xmm9, 8
+ movq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
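+
+; Illustrative walk-through: if a previous update left PBlockLen = 10 and this
+; call supplies PLAIN_CYPH_LEN = 20, then r13 = 10, 6 input bytes complete the
+; pending block (r15 = 14 >= 0), the finished block is folded into AadHash via
+; GHASH_MUL, PBlockLen is reset to 0, the 6 output bytes are written, and
+; DATA_OFFSET leaves the macro at 6 so the main loop continues from there.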
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified
+; Updated AAD_HASH is returned in %%T3
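+;
+; Worked example (illustrative): for a = 200 bytes, b = floor(200/16) = 12 and
+; %%num_initial_blocks = 12 mod 8 = 4. When at least 128 bytes remain after
+; those blocks, the macro also encrypts the next 8 blocks, so the by-8 main
+; loop in GCM_ENC_DEC always has a full set of ciphertext blocks to GHASH.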
+
+%macro INITIAL_BLOCKS 23
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%HASH_KEY %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+
+%assign i (8-%%num_initial_blocks)
+ movdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg
+
+ ; start AES for %%num_initial_blocks blocks
+ movdqu %%CTR, [%%GDATA + CurCount] ; %%CTR = Y0
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa reg(i), %%CTR
+ pshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+movdqu %%T_key, [%%GDATA+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ pxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep 9
+movdqu %%T_key, [%%GDATA+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ aesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+movdqu %%T_key, [%%GDATA+16*10]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ aesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ pxor reg(i), %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ movdqa reg(i), %%T1
+ %endif
+ pshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ pxor reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ movdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; no need for precomputed constants
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Prepare 8 counter blocks and perform rounds of AES
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM1, %%CTR
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM2, %%CTR
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM3, %%CTR
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM4, %%CTR
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM5, %%CTR
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM6, %%CTR
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM7, %%CTR
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM8, %%CTR
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ movdqu %%T_key, [%%GDATA+16*0]
+ pxor %%XMM1, %%T_key
+ pxor %%XMM2, %%T_key
+ pxor %%XMM3, %%T_key
+ pxor %%XMM4, %%T_key
+ pxor %%XMM5, %%T_key
+ pxor %%XMM6, %%T_key
+ pxor %%XMM7, %%T_key
+ pxor %%XMM8, %%T_key
+
+
+%assign i 1
+%rep 9 ; do 9 rounds
+ movdqu %%T_key, [%%GDATA+16*i]
+ aesenc %%XMM1, %%T_key
+ aesenc %%XMM2, %%T_key
+ aesenc %%XMM3, %%T_key
+ aesenc %%XMM4, %%T_key
+ aesenc %%XMM5, %%T_key
+ aesenc %%XMM6, %%T_key
+ aesenc %%XMM7, %%T_key
+ aesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ movdqu %%T_key, [%%GDATA+16*i]
+ aesenclast %%XMM1, %%T_key
+ aesenclast %%XMM2, %%T_key
+ aesenclast %%XMM3, %%T_key
+ aesenclast %%XMM4, %%T_key
+ aesenclast %%XMM5, %%T_key
+ aesenclast %%XMM6, %%T_key
+ aesenclast %%XMM7, %%T_key
+ aesenclast %%XMM8, %%T_key
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ pxor %%XMM1, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM1, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ pxor %%XMM2, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM2, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ pxor %%XMM3, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM3, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ pxor %%XMM4, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM4, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ pxor %%XMM5, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM5, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ pxor %%XMM6, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM6, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ pxor %%XMM7, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM7, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ pxor %%XMM8, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ movdqa %%T7, %%XMM1
+ movdqu [rsp + TMP2], %%XMM2
+ movdqu [rsp + TMP3], %%XMM3
+ movdqu [rsp + TMP4], %%XMM4
+ movdqu [rsp + TMP5], %%XMM5
+ movdqu [rsp + TMP6], %%XMM6
+ movdqu [rsp + TMP7], %%XMM7
+ movdqu [rsp + TMP8], %%XMM8
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+
+ movdqa %%T4, %%T7
+ pshufd %%T6, %%T7, 01001110b
+ pxor %%T6, %%T7
+ %ifidn %%loop_idx, in_order
+ paddd %%CTR, [ONE] ; INCR CNT
+ %else
+ paddd %%CTR, [ONEf] ; INCR CNT
+ %endif
+ movdqu %%T5, [%%GDATA + HashKey_8]
+        pclmulqdq       %%T4, %%T5, 0x11                ; %%T4 = a1*b1
+ pclmulqdq %%T7, %%T5, 0x00 ; %%T7 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_8_k]
+        pclmulqdq       %%T6, %%T5, 0x00                ; %%T6 = (a1+a0)*(b1+b0)
+ movdqa %%XMM1, %%CTR
+
+ %ifidn %%loop_idx, in_order
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM2, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM3, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM4, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM5, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM6, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM7, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM8, %%CTR
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+ %else
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM2, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM3, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM4, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM5, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM6, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM7, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM8, %%CTR
+ %endif
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ movdqu %%T1, [%%GDATA + 16*0]
+ pxor %%XMM1, %%T1
+ pxor %%XMM2, %%T1
+ pxor %%XMM3, %%T1
+ pxor %%XMM4, %%T1
+ pxor %%XMM5, %%T1
+ pxor %%XMM6, %%T1
+ pxor %%XMM7, %%T1
+ pxor %%XMM8, %%T1
+
+        ;; the saved %%XMM2 and %%T5 (HashKey_7) hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+ movdqu %%T1, [rsp + TMP2]
+ movdqa %%T3, %%T1
+
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_7]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_7_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*1]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ movdqu %%T1, [%%GDATA + 16*2]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Karatsuba Method
+ movdqu %%T1, [rsp + TMP3]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_6]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_6_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*3]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP4]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_5]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_5_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*4]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*5]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP5]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_4]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_4_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+
+ movdqu %%T1, [%%GDATA + 16*6]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+ movdqu %%T1, [rsp + TMP6]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_3]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_3_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*7]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP7]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_2]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_2_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*8]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ ;; %%XMM8, %%T5 hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+ movdqu %%T1, [rsp + TMP8]
+ movdqa %%T3, %%T1
+
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T7, %%T3
+ pxor %%T4, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*9]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ movdqu %%T5, [%%GDATA + 16*10]
+
+%assign i 0
+%assign j 1
+%rep 8
+ XLDR %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%T3, %%T1
+ %endif
+
+ pxor %%T1, %%T5
+ aesenclast reg(j), %%T1 ; XMM1:XMM8
+ XSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j) ; Write to the Output buffer
+
+ %ifidn %%ENC_DEC, DEC
+ movdqa reg(j), %%T3
+ %endif
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+
+
+
+ pxor %%T2, %%T6
+ pxor %%T2, %%T4
+ pxor %%T2, %%T7
+
+
+ movdqa %%T3, %%T2
+ pslldq %%T3, 8 ; shift-L %%T3 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%T7, %%T3
+ pxor %%T4, %%T2 ; accumulate the results in %%T4:%%T7
+
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%T7
+ movdqa %%T3, %%T7
+ movdqa %%T1, %%T7 ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three shifts independently
+
+        pslld   %%T2, 31                                ; packed left shift << 31
+        pslld   %%T3, 30                                ; packed left shift << 30
+        pslld   %%T1, 25                                ; packed left shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T1
+
+ movdqa %%T5, %%T2
+ psrldq %%T5, 4 ; shift-R %%T5 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ ;second phase of the reduction
+        movdqa  %%T2,%%T7                               ; make 3 copies of %%T7 (in %%T2, %%T3, %%T1) for doing three shift operations
+ movdqa %%T3,%%T7
+ movdqa %%T1,%%T7
+
+        psrld   %%T2,1                                  ; packed right shift >> 1
+        psrld   %%T3,2                                  ; packed right shift >> 2
+        psrld   %%T1,7                                  ; packed right shift >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T1
+
+ pxor %%T2, %%T5
+ pxor %%T7, %%T2
+        pxor    %%T7, %%T4                              ; the result is in %%T7
+
+
+ pxor %%XMM1, %%T7
+
+%endmacro
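+
+; Sketch of the interleaving above: while the AES rounds for the next 8 counter
+; blocks are in flight, the 8 previously produced ciphertext blocks (with the
+; running hash already folded into the first of them) are multiplied by the
+; precomputed powers H^8 .. H^1 and summed,
+;       X_new = (X_old + C_1)*H^8 + C_2*H^7 + ... + C_8*H
+; the partial products are accumulated in <%%T4:%%T7> (middle terms in %%T6)
+; and reduced modulo the GCM polynomial only once, at the end of the macro.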
+
+
+; GHASH the last 8 ciphertext blocks.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+
+ ; Karatsuba Method
+ movdqa %%T6, %%XMM1
+ pshufd %%T2, %%XMM1, 01001110b
+ pxor %%T2, %%XMM1
+ movdqu %%T5, [%%GDATA + HashKey_8]
+ pclmulqdq %%T6, %%T5, 0x11 ; %%T6 = a1*b1
+
+ pclmulqdq %%XMM1, %%T5, 0x00 ; %%XMM1 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_8_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ movdqa %%T7, %%XMM1
+ movdqa %%XMM1, %%T2 ; result in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM2
+ pshufd %%T2, %%XMM2, 01001110b
+ pxor %%T2, %%XMM2
+ movdqu %%T5, [%%GDATA + HashKey_7]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM2, %%T5, 0x00 ; %%XMM2 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_7_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM2
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM3
+ pshufd %%T2, %%XMM3, 01001110b
+ pxor %%T2, %%XMM3
+ movdqu %%T5, [%%GDATA + HashKey_6]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM3, %%T5, 0x00 ; %%XMM3 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_6_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM3
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM4
+ pshufd %%T2, %%XMM4, 01001110b
+ pxor %%T2, %%XMM4
+ movdqu %%T5, [%%GDATA + HashKey_5]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM4, %%T5, 0x00              ; %%XMM4 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_5_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM4
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM5
+ pshufd %%T2, %%XMM5, 01001110b
+ pxor %%T2, %%XMM5
+ movdqu %%T5, [%%GDATA + HashKey_4]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM5, %%T5, 0x00              ; %%XMM5 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_4_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM5
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM6
+ pshufd %%T2, %%XMM6, 01001110b
+ pxor %%T2, %%XMM6
+ movdqu %%T5, [%%GDATA + HashKey_3]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM6, %%T5, 0x00              ; %%XMM6 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_3_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM6
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM7
+ pshufd %%T2, %%XMM7, 01001110b
+ pxor %%T2, %%XMM7
+ movdqu %%T5, [%%GDATA + HashKey_2]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM7, %%T5, 0x00              ; %%XMM7 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_2_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM7
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM8
+ pshufd %%T2, %%XMM8, 01001110b
+ pxor %%T2, %%XMM8
+ movdqu %%T5, [%%GDATA + HashKey]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM8, %%T5, 0x00              ; %%XMM8 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM8
+ pxor %%T2, %%XMM1
+ pxor %%T2, %%T6
+ pxor %%T2, %%T7 ; middle section of the temp results combined as in Karatsuba algorithm
+
+
+ movdqa %%T4, %%T2
+ pslldq %%T4, 8 ; shift-L %%T4 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%T7, %%T4
+ pxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%T7
+ movdqa %%T3, %%T7
+ movdqa %%T4, %%T7 ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+        pslld   %%T2, 31                                ; packed left shift << 31
+        pslld   %%T3, 30                                ; packed left shift << 30
+        pslld   %%T4, 25                                ; packed left shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T4
+
+ movdqa %%T1, %%T2
+ psrldq %%T1, 4 ; shift-R %%T1 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+        movdqa  %%T2,%%T7                               ; make 3 copies of %%T7 (in %%T2, %%T3, %%T4) for doing three shift operations
+ movdqa %%T3,%%T7
+ movdqa %%T4,%%T7
+
+        psrld   %%T2,1                                  ; packed right shift >> 1
+        psrld   %%T3,2                                  ; packed right shift >> 2
+        psrld   %%T4,7                                  ; packed right shift >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T4
+
+ pxor %%T2, %%T1
+ pxor %%T7, %%T2
+ pxor %%T6, %%T7 ; the result is in %%T6
+
+%endmacro
+
+; Encryption of a single block
+%macro ENCRYPT_SINGLE_BLOCK 3
+%define %%GDATA %1
+%define %%ST %2
+%define %%T1 %3
+ movdqu %%T1, [%%GDATA+16*0]
+ pxor %%ST, %%T1
+%assign i 1
+%rep 9
+ movdqu %%T1, [%%GDATA+16*i]
+ aesenc %%ST, %%T1
+%assign i (i+1)
+%endrep
+ movdqu %%T1, [%%GDATA+16*10]
+ aesenclast %%ST, %%T1
+%endmacro
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+        ;; Required for Update/GCM_ENC
+ ;the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ movdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ movdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ movdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ movdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ movdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ movdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ movdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ movdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ movdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ movdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ movdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ movdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ movdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ movdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ movdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ movdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ movdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ movdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_data struct to prepare for encoding/decoding.
+; Input: gcm_data struct* (GDATA), IV, Additional Authentication data (A_IN), Additional
+; Data length (A_LEN)
+; Output: Updated GDATA with the hash of A_IN (AadHash) and initialized other parts of GDATA.
+; Clobbers rax, r10-r13, and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 4
+%define %%GDATA %1
+%define %%IV %2
+%define %%A_IN %3
+%define %%A_LEN %4
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ movdqu %%SUBHASH, [%%GDATA + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ pxor xmm2, xmm3
+ mov r10, %%A_LEN
+
+ movdqu [%%GDATA + AadHash], %%AAD_HASH ; my_ctx_data.aad hash = aad_hash
+ mov [%%GDATA + AadLen], r10 ; my_ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA + InLen], r10 ; my_ctx_data.in_length = 0
+ mov [%%GDATA + PBlockLen], r10 ; my_ctx_data.partial_block_length = 0
+ movdqu [%%GDATA + PBlockEncKey], xmm2 ; my_ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ movdqu xmm2, [r10]
+ movdqu [%%GDATA + OrigIV], xmm2 ; my_ctx_data.orig_IV = iv
+
+ pshufb xmm2, [SHUF_MASK]
+
+ movdqu [%%GDATA + CurCount], xmm2 ; my_ctx_data.current_counter = iv
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_data struct has been
+; initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 5
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%ENC_DEC %5
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes, process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
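+; (illustrative example, not part of the original flow notes: for a hypothetical
+;  300 byte message there are 18 full 16 byte blocks and a 12 byte tail, so
+;  18 mod 8 = 2 blocks go through the initial-blocks path, the remaining 16 are
+;  processed 8 at a time, and the 12 byte tail is handled after %%_zero_cipher_left)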
+
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA+InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+ movdqu xmm13, [%%GDATA + HashKey] ; xmm13 = HashKey
+ movdqu xmm8, [%%GDATA + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+ mov r13, %%PLAIN_CYPH_LEN ; save the number of bytes of plaintext/ciphertext
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ;save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ movd r15d, xmm9
+ and r15d, 255
+ pshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ pshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ pshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ pshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ pshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ GHASH_LAST_8 %%GDATA, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ movdqu [%%GDATA + AadHash], xmm14
+ movdqu [%%GDATA + CurCount], xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+ mov [%%GDATA + PBlockLen], r13 ; my_ctx_data.partial_block_length = r13
+ ; handle the last <16 Byte block separately
+
+ paddd xmm9, [ONE] ; INCR CNT to get Yn
+ movdqu [%%GDATA + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+ pshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9, xmm2 ; E(K, Yn)
+ movdqu [%%GDATA + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ movdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+ movdqu xmm2, [r12] ; get the appropriate shuffle mask
+ pshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+ %ifidn %%ENC_DEC, DEC
+ movdqa xmm2, xmm1
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ pand xmm2, xmm1
+ pshufb xmm2, [SHUF_MASK]
+ pxor xmm14, xmm2
+ movdqu [%%GDATA + AadHash], xmm14
+
+ %else
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ pshufb xmm9, [SHUF_MASK]
+ pxor xmm14, xmm9
+ movdqu [%%GDATA + AadHash], xmm14
+
+ pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ %endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ movq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ psrldq xmm9, 8
+ movq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_data struct* (GDATA) and whether encoding or decoding (ENC_DEC).
+; Output: Authorization Tag (AUTH_TAG) and Authorization Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 4
+%define %%GDATA %1
+%define %%AUTH_TAG %2
+%define %%AUTH_TAG_LEN %3
+%define %%ENC_DEC %4
+%define %%PLAIN_CYPH_LEN rax
+
+ mov r12, [%%GDATA + PBlockLen] ; r12 = PBlockLen (partial block length in bytes)
+ movdqu xmm14, [%%GDATA + AadHash]
+ movdqu xmm13, [%%GDATA + HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ movdqu [%%GDATA+AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ movd xmm15, r12d ; len(A) in xmm15
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128)
+ movq xmm1, %%PLAIN_CYPH_LEN
+ pslldq xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ pxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ pxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ pshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ movdqu xmm9, [%%GDATA + OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9, xmm2 ; E(K, Y0)
+
+ pxor xmm9, xmm14
+
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8:
+ movq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ movq rax, xmm9
+ mov [r10], rax
+ psrldq xmm9, 8
+ movd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ movdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ;GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_precomp_sse
+; (gcm_data *my_ctx_data);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_precomp_sse
+aesni_gcm128_precomp_sse:
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ pxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6, xmm2 ; xmm6 = HashKey
+
+ pshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ movdqa xmm2, xmm6
+ psllq xmm6, 1
+ psrlq xmm2, 63
+ movdqa xmm1, xmm2
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ por xmm6, xmm2
+ ;reduction
+ pshufd xmm2, xmm1, 00100100b
+ pcmpeqd xmm2, [TWOONE]
+ pand xmm2, [POLY]
+ pxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_init_sse(
+; gcm_data *my_ctx_data,
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len); /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_init_sse
+aesni_gcm128_init_sse:
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 1*16
+ movdqu [rsp + 0*16],xmm6
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+%endif
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_update_sse(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len); /* Length of data in Bytes for encryption. Must be a multiple of 16 bytes. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_update_sse
+aesni_gcm128_enc_update_sse:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_update_sse(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Cyphertext input */
+; u64 plaintext_len); /* Length of data in Bytes for decryption. Must be a multiple of 16 bytes. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_update_sse
+aesni_gcm128_dec_update_sse:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_finalize_sse(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_finalize_sse
+aesni_gcm128_enc_finalize_sse:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ movdqu [rsp + 0*16],xmm6
+ movdqu [rsp + 1*16],xmm9
+ movdqu [rsp + 2*16],xmm11
+ movdqu [rsp + 3*16],xmm14
+ movdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + 4*16]
+ movdqu xmm14 , [rsp + 3*16]
+ movdqu xmm11 , [rsp + 2*16]
+ movdqu xmm9 , [rsp + 1*16]
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
+
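+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Illustrative call sequence for the streaming entry points above (a sketch only, not
+; part of the original sources). It assumes the caller has already written the expanded
+; AES round keys into the gcm_data struct (done outside this file) and has run
+; aesni_gcm128_precomp_sse to derive the hash keys; gdata, iv, aad, aad_len, in, out,
+; len and tag are hypothetical caller-side variables:
+;
+;   aesni_gcm128_init_sse(&gdata, iv, aad, aad_len);
+;   aesni_gcm128_enc_update_sse(&gdata, out, in, len); /* len: multiple of 16 bytes */
+;   aesni_gcm128_enc_finalize_sse(&gdata, tag, 16);    /* 16 byte tag; 12 or 8 also valid */
+;
+; The corresponding dec_update/dec_finalize entry points are used the same way for decryption.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;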
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_finalize_sse(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_finalize_sse
+aesni_gcm128_dec_finalize_sse:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ movdqu [rsp + 0*16],xmm6
+ movdqu [rsp + 1*16],xmm9
+ movdqu [rsp + 2*16],xmm11
+ movdqu [rsp + 3*16],xmm14
+ movdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + 4*16]
+ movdqu xmm14 , [rsp + 3*16]
+ movdqu xmm11 , [rsp + 2*16]
+ movdqu xmm9 , [rsp + 1*16]
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_enc_sse(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len, /* Length of data in Bytes for encryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_enc_sse
+aesni_gcm128_enc_sse:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ GCM_COMPLETE arg1, arg8, arg9, ENC
+
+ FUNC_RESTORE
+
+ ret
+
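+; Single-shot sketch (illustrative only): the all-in-one entry point above is, per its
+; body, GCM_INIT + GCM_ENC_DEC + GCM_COMPLETE in one call, e.g.
+;   aesni_gcm128_enc_sse(&gdata, out, in, len, iv, aad, aad_len, tag, 16);
+; with the same assumption that the AES round keys and precomputed hash keys are
+; already present in gdata.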
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm128_dec_sse(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+; u64 plaintext_len, /* Length of data in Bytes for decryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm128_dec_sse
+aesni_gcm128_dec_sse:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ GCM_COMPLETE arg1, arg8, arg9, DEC
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm
new file mode 100644
index 000000000..2c10c916e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen2.asm
@@ -0,0 +1,2036 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; Must be a multiple of 4 bytes, as given by the definition in the spec.
+; The code additionally supports aadLen of any length.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; Throughout the code, one-tab and two-tab indentations are used: one tab is for the GHASH part, two tabs are for the AES part.
+;
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+default rel
+; need to push 4 registers into stack to maintain
+%define STACK_OFFSET 8*4
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
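+; (illustrative usage, not in the original header: after the precompute step has stored
+;  HashKey<<1 mod poly in the gcm_data struct, "GHASH_MUL GH, HK, T1, T2, T3, T4, T5"
+;  leaves GH = GH * HashKey mod poly and clobbers the five temporaries)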
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba
+ vpshufd %%T2, %%GH, 01001110b
+ vpshufd %%T3, %%HK, 01001110b
+ vpxor %%T2, %%T2, %%GH ; %%T2 = (a1+a0)
+ vpxor %%T3, %%T3, %%HK ; %%T3 = (b1+b0)
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
+ vpclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ vpxor %%T2, %%T2, %%GH
+ vpxor %%T2, %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
+
+ vpslldq %%T3, %%T2, 8 ; shift-L %%T3 2 DWs
+ vpsrldq %%T2, %%T2, 8 ; shift-R %%T2 2 DWs
+ vpxor %%GH, %%GH, %%T3
+ vpxor %%T1, %%T1, %%T2 ; <%%T1:%%GH> = %%GH x %%HK
+
+ ;first phase of the reduction
+ vpslld %%T2, %%GH, 31 ; packed right shifting << 31
+ vpslld %%T3, %%GH, 30 ; packed right shifting shift << 30
+ vpslld %%T4, %%GH, 25 ; packed right shifting shift << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T5, %%T2, 4 ; shift-R %%T5 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%GH,1 ; packed left shifting >> 1
+ vpsrld %%T3,%%GH,2 ; packed left shifting >> 2
+ vpsrld %%T4,%%GH,7 ; packed left shifting >> 7
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpxor %%T2, %%T2, %%T5
+ vpxor %%GH, %%GH, %%T2
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+
+
+%endmacro
+
+
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
+ vmovdqa %%T5, %%HK
+
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_3_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_5_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_8_k], %%T1
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
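+; Illustrative invocation (a sketch, with arbitrary register choices): with r13 holding a
+; length below 16, "READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax" packs the r13
+; bytes at [r10] into xmm1, using r12, r15 and rax as scratch registers.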
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ vpxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+ vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exist
+ je %%_done
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ vpinsrq %%OUTPUT, %%TMP1, 1
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ vpinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 5
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+
+ vmovdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; the current data offset (DATA_OFFSET), and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 7
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%DATA_OFFSET %5
+%define %%AAD_HASH %6
+%define %%ENC_DEC %7
+
+ mov r13, [%%GDATA + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read: ;Finished reading in data
+
+
+ vmovdqu xmm9, [%%GDATA + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_1 ;Determine if the partial block is not being filled and shift the mask accordingly
+ sub r12, r15
+%%_no_extra_mask_1:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpand xmm3, xmm1
+ vpshufb xmm3, [SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN
+%%_dec_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+ jge %%_no_extra_mask_2 ;Determine if the partial block is not being filled and shift the mask accordingly
+ sub r12, r15
+%%_no_extra_mask_2:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN
+%%_encode_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
+; Updated AAD_HASH is returned in %%T3
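+; (worked example, not in the original comment: a = 200 plaintext bytes gives
+;  b = floor(200/16) = 12 full blocks, so %%num_initial_blocks = 12 mod 8 = 4;
+;  those 4 blocks are encrypted and GHASHed here before the by-8 path takes over)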
+
+%macro INITIAL_BLOCKS 23
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%HASH_KEY %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+
+%assign i (8-%%num_initial_blocks)
+ movdqu reg(i), %%XMM8 ;move AAD_HASH to temp reg
+ ; start AES for %%num_initial_blocks blocks
+ vmovdqu %%CTR, [%%GDATA + CurCount] ; %%CTR = Y0
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+vmovdqu %%T_key, [%%GDATA+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep 13 ; encrypt N blocks with 13 key rounds
+vmovdqu %%T_key, [%%GDATA+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA+16*j] ; encrypt with last (14th) key round
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ vpxor reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ vmovdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; no need for precomputed constants
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; with at least 128 bytes left, prepare and encrypt the next 8 counter blocks ahead of the main by-8 loop
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM1, %%CTR
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM2, %%CTR
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM3, %%CTR
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM4, %%CTR
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM5, %%CTR
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM6, %%CTR
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM7, %%CTR
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM8, %%CTR
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu %%T_key, [%%GDATA+16*0]
+ vpxor %%XMM1, %%T_key
+ vpxor %%XMM2, %%T_key
+ vpxor %%XMM3, %%T_key
+ vpxor %%XMM4, %%T_key
+ vpxor %%XMM5, %%T_key
+ vpxor %%XMM6, %%T_key
+ vpxor %%XMM7, %%T_key
+ vpxor %%XMM8, %%T_key
+
+
+%assign i 1
+%rep 13 ; do early (13) rounds
+ vmovdqu %%T_key, [%%GDATA+16*i]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ vmovdqu %%T_key, [%%GDATA+16*i] ; do final key round
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; r11 is the data offset value
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONE]
+ vpaddd %%XMM3, %%XMM2, [ONE]
+ vpaddd %%XMM4, %%XMM3, [ONE]
+ vpaddd %%XMM5, %%XMM4, [ONE]
+ vpaddd %%XMM6, %%XMM5, [ONE]
+ vpaddd %%XMM7, %%XMM6, [ONE]
+ vpaddd %%XMM8, %%XMM7, [ONE]
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONEf]
+ vpaddd %%XMM3, %%XMM2, [ONEf]
+ vpaddd %%XMM4, %%XMM3, [ONEf]
+ vpaddd %%XMM5, %%XMM4, [ONEf]
+ vpaddd %%XMM6, %%XMM5, [ONEf]
+ vpaddd %%XMM7, %%XMM6, [ONEf]
+ vpaddd %%XMM8, %%XMM7, [ONEf]
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%T1
+ vpxor %%XMM2, %%T1
+ vpxor %%XMM3, %%T1
+ vpxor %%XMM4, %%T1
+ vpxor %%XMM5, %%T1
+ vpxor %%XMM6, %%T1
+ vpxor %%XMM7, %%T1
+ vpxor %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+
+ vpshufd %%T6, %%T2, 01001110b
+ vpxor %%T6, %%T2
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%T6, %%T6, %%T5, 0x00 ;
+
+
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpshufd %%T3, %%T1, 01001110b
+ vpxor %%T3, %%T1
+ vmovdqu %%T5, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T3, %%T3, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vpxor %%T6, %%T4
+ vpxor %%T6, %%T7
+
+
+ vmovdqu %%T5, [%%GDATA + 16*10]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*13]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*14]
+
+
+%assign i 0
+%assign j 1
+%rep 8
+ %ifidn %%ENC_DEC, ENC
+
+ %ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %endif
+
+ vaesenclast reg(j), reg(j), %%T2
+
+ %else
+
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+
+ %endif
+
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+
+ vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
+ vpsrldq %%T6, %%T6, 8 ; shift-R %%T6 2 DWs
+ vpxor %%T7, %%T3
+ vpxor %%T6, %%T4 ; accumulate the results in %%T6:%%T7
+
+
+ ;first phase of the reduction
+
+ vpslld %%T2, %%T7, 31 ; packed right shifting << 31
+ vpslld %%T3, %%T7, 30 ; packed right shifting shift << 30
+ vpslld %%T4, %%T7, 25 ; packed right shifting shift << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ %ifidn %%ENC_DEC, ENC
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer
+ %endif
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%T7,1 ; packed left shifting >> 1
+ vpsrld %%T3,%%T7,2 ; packed left shifting >> 2
+ vpsrld %%T4,%%T7,7 ; packed left shifting >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK]
+ vpshufb %%XMM3, [SHUF_MASK]
+ vpshufb %%XMM4, [SHUF_MASK]
+ vpshufb %%XMM5, [SHUF_MASK]
+ vpshufb %%XMM6, [SHUF_MASK]
+ vpshufb %%XMM7, [SHUF_MASK]
+ vpshufb %%XMM8, [SHUF_MASK]
+
+
+ vpxor %%XMM1, %%T6
+
+%endmacro
+
+
+; GHASH the last 8 ciphertext blocks.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+ ;; Karatsuba Method
+
+
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpxor %%T2, %%XMM1
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vmovdqu %%T3, [%%GDATA + HashKey_8_k]
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpxor %%T2, %%XMM2
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_7_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpxor %%T2, %%XMM3
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_6_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpxor %%T2, %%XMM4
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_5_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpxor %%T2, %%XMM5
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_4_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpxor %%T2, %%XMM6
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_3_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpxor %%T2, %%XMM7
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_2_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpxor %%T2, %%XMM8
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vmovdqu %%T3, [%%GDATA + HashKey_k]
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T4
+ vpxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;first phase of the reduction
+
+ vpslld %%T2, %%T7, 31 ; packed right shifting << 31
+ vpslld %%T3, %%T7, 30 ; packed right shifting shift << 30
+ vpslld %%T4, %%T7, 25 ; packed right shifting shift << 25
+
+ vpxor %%T2, %%T2, %%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2, %%T4
+
+ vpsrldq %%T1, %%T2, 4 ; shift-R %%T1 1 DW
+
+ vpslldq %%T2, %%T2, 12 ; shift-L %%T2 3 DWs
+ vpxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+
+ vpsrld %%T2,%%T7,1 ; packed left shifting >> 1
+ vpsrld %%T3,%%T7,2 ; packed left shifting >> 2
+ vpsrld %%T4,%%T7,7 ; packed left shifting >> 7
+ vpxor %%T2, %%T2,%%T3 ; xor the shifted versions
+ vpxor %%T2, %%T2,%%T4
+
+ vpxor %%T2, %%T2, %%T1
+ vpxor %%T7, %%T7, %%T2
+ vpxor %%T6, %%T6, %%T7 ; the result is in %%T6
+
+
+%endmacro
+
+
+; Encryption of a single block
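+; The AES-256 key schedule in GDATA holds 15 round keys of 16 bytes each
+; (GDATA+16*0 .. GDATA+16*14): one whitening key, 13 middle rounds handled
+; by vaesenc, and the final round handled by vaesenclast.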
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxor %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep 13 ; early key rounds (13)
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+ vaesenclast %%XMM0, [%%GDATA+16*i] ; final key round (14)
+%endmacro
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+ ;; Required for Update/GCM_ENC
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_data struct to prepare for encoding/decoding.
+; Input: gcm_data struct* (GDATA), IV, Additional Authentication data (A_IN), Additional
+; Data length (A_LEN)
+; Output: GDATA updated with the hash of A_IN (AadHash) and its other state fields initialized.
+; Clobbers rax, r10-r13 and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 4
+%define %%GDATA %1
+%define %%IV %2
+%define %%A_IN %3
+%define %%A_LEN %4
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ vmovdqu %%SUBHASH, [%%GDATA + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ vpxor xmm2, xmm3
+ mov r10, %%A_LEN
+
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH ; my_ctx_data.aad_hash = aad_hash
+ mov [%%GDATA + AadLen], r10 ; my_ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA + InLen], r10 ; my_ctx_data.in_length = 0
+ mov [%%GDATA + PBlockLen], r10 ; my_ctx_data.partial_block_length = 0
+ vmovdqu [%%GDATA + PBlockEncKey], xmm2 ; my_ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ vmovdqu xmm2, [r10]
+ vmovdqu [%%GDATA + OrigIV], xmm2 ; my_ctx_data.orig_IV = iv
+
+ vpshufb xmm2, [SHUF_MASK]
+
+ vmovdqu [%%GDATA + CurCount], xmm2 ; my_ctx_data.current_counter = iv
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_data struct has been
+; initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 5
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%ENC_DEC %5
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes, process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
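+; Example: a 200 byte update gives 12 whole blocks plus an 8 byte tail;
+; 12 mod 8 = 4, so INITIAL_BLOCKS encrypts 4 blocks (and, since at least
+; 128 bytes remain, the next 8 blocks as well), GHASH_LAST_8 then folds
+; those 8 ciphertext blocks into the hash, and the remaining 8 bytes are
+; written out through the %%_zero_cipher_left path as a partial block.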
+
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA+InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+ vmovdqu xmm13, [%%GDATA + HashKey] ; xmm13 = HashKey
+ vmovdqu xmm8, [%%GDATA + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+
+ mov r13, %%PLAIN_CYPH_LEN
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ; save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ vpshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN,%%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ vpshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ GHASH_LAST_8 %%GDATA, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ vmovdqu [%%GDATA + AadHash], xmm14 ; my_ctx_data.aad_hash = xmm14
+ vmovdqu [%%GDATA + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+ mov [%%GDATA + PBlockLen], r13 ; my_ctx_data.partial_block_length = r13
+ ; handle the last <16 Byte block separately
+
+ vpaddd xmm9, [ONE] ; INCR CNT to get Yn
+ vmovdqu [%%GDATA + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Yn)
+ vmovdqu [%%GDATA + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa xmm2, xmm1
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpand xmm2, xmm1
+ vpshufb xmm2, [SHUF_MASK]
+ vpxor xmm14, xmm2
+ vmovdqu [%%GDATA + AadHash], xmm14
+
+ %else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ vpxor xmm14, xmm9
+ vmovdqu [%%GDATA + AadHash], xmm14
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ %endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_data struct* (GDATA) and whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 4
+%define %%GDATA %1
+%define %%AUTH_TAG %2
+%define %%AUTH_TAG_LEN %3
+%define %%ENC_DEC %4
+%define %%PLAIN_CYPH_LEN rax
+
+ mov r12, [%%GDATA + PBlockLen]
+ vmovdqu xmm14, [%%GDATA+AadHash]
+ vmovdqu xmm13, [%%GDATA+HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA+AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA+InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovd xmm15, r12d ; len(A) in xmm15
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu xmm9, [%%GDATA+OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Y0)
+
+ vpxor xmm9, xmm14
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ; GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_precomp_avx_gen2
+; (gcm_data *my_ctx_data);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_precomp_avx_gen2
+aesni_gcm256_precomp_avx_gen2:
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
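+ ; xmm6 holds the byte-reflected hash key E(K, 0^128). The sequence below
+ ; shifts the full 128-bit value left by one bit: vpsllq shifts each 64-bit
+ ; lane, the vpsrlq/vpslldq pair carries the bit that falls out of the low
+ ; lane into the high lane, and the bit that falls out of the high lane
+ ; (recovered via vpsrldq) selects, through vpcmpeqd/vpand, whether [POLY]
+ ; is XORed in, i.e. whether a reduction step is needed.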
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, 1
+ vpsrlq xmm2, 63
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8
+ vpsrldq xmm1, xmm1, 8
+ vpor xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [TWOONE]
+ vpand xmm2, [POLY]
+ vpxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_init_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len); /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_init_avx_gen2
+aesni_gcm256_init_avx_gen2:
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 1*16
+ vmovdqu [rsp + 0*16],xmm6
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+%endif
+ pop r13
+ pop r12
+
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_update_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len); /* Length of data in Bytes for encryption. must be a multiple of 16 bytes*/
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_update_avx_gen2
+aesni_gcm256_enc_update_avx_gen2:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_update_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Cyphertext input */
+; u64 plaintext_len); /* Length of data in Bytes for decryption. Must be a multiple of 16 bytes. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_update_avx_gen2
+aesni_gcm256_dec_update_avx_gen2:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_finalize_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_finalize_avx_gen2
+aesni_gcm256_enc_finalize_avx_gen2:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_finalize_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_finalize_avx_gen2
+aesni_gcm256_dec_finalize_avx_gen2:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len, /* Length of data in Bytes for encryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_avx_gen2
+aesni_gcm256_enc_avx_gen2:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ GCM_COMPLETE arg1, arg8, arg9, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_avx_gen2(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+; u64 plaintext_len, /* Length of data in Bytes for decryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_avx_gen2
+aesni_gcm256_dec_avx_gen2:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ GCM_COMPLETE arg1, arg8, arg9, DEC
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm
new file mode 100644
index 000000000..022f73fa3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_avx_gen4.asm
@@ -0,0 +1,2030 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+; The details of the implementation is explained in:
+; Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on Intel Architecture Processors. October, 2012.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+; The spec defines aadLen as a multiple of 4 bytes; the code additionally supports any aadLen.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part.
+;
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+default rel
+; the functions below push 4 registers onto the stack; STACK_OFFSET accounts for them
+%define STACK_OFFSET 8*4
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
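+; Unlike the shift-and-XOR reduction used in the avx_gen2 code above, the
+; product here is built with four vpclmulqdq (a1*b1, a0*b0, a1*b0, a0*b1)
+; and the reduction is carried out with two more carry-less multiplies
+; against the precomputed [POLY2] constant.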
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vpclmulqdq %%T1, %%GH, %%HK, 0x11 ; %%T1 = a1*b1
+ vpclmulqdq %%T2, %%GH, %%HK, 0x00 ; %%T2 = a0*b0
+ vpclmulqdq %%T3, %%GH, %%HK, 0x01 ; %%T3 = a1*b0
+ vpclmulqdq %%GH, %%GH, %%HK, 0x10 ; %%GH = a0*b1
+ vpxor %%GH, %%GH, %%T3
+
+
+ vpsrldq %%T3, %%GH, 8 ; shift-R %%GH 2 DWs
+ vpslldq %%GH, %%GH, 8 ; shift-L %%GH 2 DWs
+
+ vpxor %%T1, %%T1, %%T3
+ vpxor %%GH, %%GH, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%GH, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L %%T2 2 DWs
+
+ vpxor %%GH, %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%GH, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%GH, %%T3, %%GH, 0x10
+ vpslldq %%GH, %%GH, 4 ; shift-L %%GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%GH, %%GH, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%GH, %%GH, %%T1 ; the result is in %%GH
+
+
+%endmacro
+
+
+; In PRECOMPUTE, the commands filling Hashkey_i_k are not required for avx_gen4
+; functions, but are kept to allow users to switch cpu architectures between calls
+; of pre, init, update, and finalize.
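+; The table stores HashKey_i = HashKey^i<<1 mod poly for i = 1..8, so
+; GHASH_8_ENCRYPT_8_PARALLEL can multiply the oldest of eight ciphertext
+; blocks by HashKey_8 and the newest by HashKey, sum the partial products,
+; and perform a single reduction for the whole group.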
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+ ; HashKey_i_k holds the XOR of the low and high halves of HashKey_i
+ vmovdqa %%T5, %%HK
+
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^2<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_2], %%T5 ; [HashKey_2] = HashKey^2<<1 mod poly
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^3<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_3], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_3_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^4<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_4], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^5<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_5], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_5_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^6<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_6], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^7<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_7], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T5, %%HK, %%T1, %%T3, %%T4, %%T6, %%T2 ; %%T5 = HashKey^8<<1 mod poly
+ vmovdqu [%%GDATA + HashKey_8], %%T5
+ vpshufd %%T1, %%T5, 01001110b
+ vpxor %%T1, %%T5
+ vmovdqu [%%GDATA + HashKey_8_k], %%T1
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
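+; Example: LENGTH = 11. The first 8 bytes are loaded with vpinsrq into the
+; low qword; the remaining 3 bytes are collected back-to-front, one byte at
+; a time, into a temporary register and inserted into the high qword, so
+; OUTPUT ends up with the 11 input bytes in its low 11 bytes and zeroes above.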
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ vpxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+ vpinsrq %%OUTPUT, [%%INPUT],0 ;Read in 8 bytes if they exist
+ je %%_done
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ vpinsrq %%OUTPUT, %%TMP1, 1
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ vpinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 5
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ vpxor %%AAD_HASH, %%AAD_HASH
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+
+ vmovdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ vpshufb %%XTMP1, [SHUF_MASK]
+ vpxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; the current data offset (DATA_OFFSET), and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
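+; PBlockEncKey holds the keystream block E(K, Yn) left over from the previous
+; update call. The shuffle mask loaded from [SHIFT_MASK + r13] shifts that
+; keystream by the r13 bytes already consumed, so the new input bytes line up
+; against the unused keystream bytes before the XOR below.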
+%macro PARTIAL_BLOCK 7
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%DATA_OFFSET %5
+%define %%AAD_HASH %6
+%define %%ENC_DEC %7
+ mov r13, [%%GDATA + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ VXLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+
+%%_data_read: ;Finished reading in data
+
+
+ vmovdqu xmm9, [%%GDATA + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ vmovdqu xmm13, [%%GDATA + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ vmovdqa xmm3, xmm1
+ vpxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in PLAIN_CYPH_IN after filling the block
+ jge %%_no_extra_mask_1 ;Determine if the partial block is not being filled and shift the mask accordingly
+ sub r12, r15
+%%_no_extra_mask_1:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpand xmm3, xmm1
+ vpshufb xmm3, [SHUF_MASK]
+ vpshufb xmm3, xmm2
+ vpxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN
+%%_dec_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+%else
+ vpxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in PLAIN_CYPH_IN after filling the block
+ jge %%_no_extra_mask_2 ;Determine if the partial block is not being filled and shift the mask accordingly
+ sub r12, r15
+%%_no_extra_mask_2:
+
+ vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ vpshufb xmm9, xmm2
+ vpxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN
+%%_encode_done:
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH
+
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ vpshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as a pointer only, not modified.
+; Updated AAD_HASH is returned in %%T3
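+; When at least 128 bytes remain, the macro also encrypts the next 8 counter
+; blocks (XMM1..XMM8) before returning, so the first call to
+; GHASH_8_ENCRYPT_8_PARALLEL already has 8 ciphertext blocks to hash.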
+
+%macro INITIAL_BLOCKS 23
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%HASH_KEY %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+
+%assign i (8-%%num_initial_blocks)
+ movdqu reg(i), %%XMM8 ;move AAD_HASH to temp reg
+ ; start AES for %%num_initial_blocks blocks
+ vmovdqu %%CTR, [%%GDATA + CurCount] ; %%CTR = Y0
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa reg(i), %%CTR
+ vpshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+vmovdqu %%T_key, [%%GDATA+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vpxor reg(i),reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep 13 ; encrypt N blocks with 13 key rounds
+vmovdqu %%T_key, [%%GDATA+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+vmovdqu %%T_key, [%%GDATA+16*j] ; encrypt with last (14th) key round
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ vaesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ vpxor reg(i), reg(i), %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa reg(i), %%T1
+ %endif
+ vpshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ vpxor reg(j), reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ vmovdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; no need for precomputed constants
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Prepare and encrypt the next 8 counter blocks (XMM1..XMM8) for the main encrypt-by-8 loop
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM1, %%CTR
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM2, %%CTR
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM3, %%CTR
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM4, %%CTR
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM5, %%CTR
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM6, %%CTR
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM7, %%CTR
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ vpaddd %%CTR, %%CTR, [ONE] ; INCR Y0
+ vmovdqa %%XMM8, %%CTR
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu %%T_key, [%%GDATA+16*0]
+ vpxor %%XMM1, %%XMM1, %%T_key
+ vpxor %%XMM2, %%XMM2, %%T_key
+ vpxor %%XMM3, %%XMM3, %%T_key
+ vpxor %%XMM4, %%XMM4, %%T_key
+ vpxor %%XMM5, %%XMM5, %%T_key
+ vpxor %%XMM6, %%XMM6, %%T_key
+ vpxor %%XMM7, %%XMM7, %%T_key
+ vpxor %%XMM8, %%XMM8, %%T_key
+
+
+%assign i 1
+%rep 13 ; do early (13) rounds
+ vmovdqu %%T_key, [%%GDATA+16*i]
+ vaesenc %%XMM1, %%T_key
+ vaesenc %%XMM2, %%T_key
+ vaesenc %%XMM3, %%T_key
+ vaesenc %%XMM4, %%T_key
+ vaesenc %%XMM5, %%T_key
+ vaesenc %%XMM6, %%T_key
+ vaesenc %%XMM7, %%T_key
+ vaesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ vmovdqu %%T_key, [%%GDATA+16*i]
+ vaesenclast %%XMM1, %%T_key
+ vaesenclast %%XMM2, %%T_key
+ vaesenclast %%XMM3, %%T_key
+ vaesenclast %%XMM4, %%T_key
+ vaesenclast %%XMM5, %%T_key
+ vaesenclast %%XMM6, %%T_key
+ vaesenclast %%XMM7, %%T_key
+ vaesenclast %%XMM8, %%T_key
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM1, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ vpxor %%XMM2, %%XMM2, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM2, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ vpxor %%XMM3, %%XMM3, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM3, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ vpxor %%XMM4, %%XMM4, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM4, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ vpxor %%XMM5, %%XMM5, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM5, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ vpxor %%XMM6, %%XMM6, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM6, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ vpxor %%XMM7, %%XMM7, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM7, %%T1
+ %endif
+
+ VXLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ vpxor %%XMM8, %%XMM8, %%T1
+ VXSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpxor %%XMM1, %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value
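+; The AES rounds for the 8 new counter blocks are interleaved with the GHASH
+; of the 8 ciphertext blocks from the previous iteration: block 1 is kept in
+; %%T2, blocks 2..8 are parked in the TMP2..TMP8 stack slots, and their
+; partial products are accumulated into %%T1:%%T7 while the vaesenc rounds
+; proceed, hiding the vpclmulqdq latency.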
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ vmovdqa %%T2, %%XMM1
+ vmovdqu [rsp + TMP2], %%XMM2
+ vmovdqu [rsp + TMP3], %%XMM3
+ vmovdqu [rsp + TMP4], %%XMM4
+ vmovdqu [rsp + TMP5], %%XMM5
+ vmovdqu [rsp + TMP6], %%XMM6
+ vmovdqu [rsp + TMP7], %%XMM7
+ vmovdqu [rsp + TMP8], %%XMM8
+
+%ifidn %%loop_idx, in_order
+ vpaddd %%XMM1, %%CTR, [ONE] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONE]
+ vpaddd %%XMM3, %%XMM2, [ONE]
+ vpaddd %%XMM4, %%XMM3, [ONE]
+ vpaddd %%XMM5, %%XMM4, [ONE]
+ vpaddd %%XMM6, %%XMM5, [ONE]
+ vpaddd %%XMM7, %%XMM6, [ONE]
+ vpaddd %%XMM8, %%XMM7, [ONE]
+ vmovdqa %%CTR, %%XMM8
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+%else
+ vpaddd %%XMM1, %%CTR, [ONEf] ; INCR CNT
+ vpaddd %%XMM2, %%XMM1, [ONEf]
+ vpaddd %%XMM3, %%XMM2, [ONEf]
+ vpaddd %%XMM4, %%XMM3, [ONEf]
+ vpaddd %%XMM5, %%XMM4, [ONEf]
+ vpaddd %%XMM6, %%XMM5, [ONEf]
+ vpaddd %%XMM7, %%XMM6, [ONEf]
+ vpaddd %%XMM8, %%XMM7, [ONEf]
+ vmovdqa %%CTR, %%XMM8
+%endif
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T1, [%%GDATA + 16*0]
+ vpxor %%XMM1, %%XMM1, %%T1
+ vpxor %%XMM2, %%XMM2, %%T1
+ vpxor %%XMM3, %%XMM3, %%T1
+ vpxor %%XMM4, %%XMM4, %%T1
+ vpxor %%XMM5, %%XMM5, %%T1
+ vpxor %%XMM6, %%XMM6, %%T1
+ vpxor %%XMM7, %%XMM7, %%T1
+ vpxor %%XMM8, %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+
+
+
+ vmovdqu %%T1, [%%GDATA + 16*1]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [%%GDATA + 16*2]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+ vpclmulqdq %%T4, %%T2, %%T5, 0x11 ; %%T4 = a1*b1
+ vpclmulqdq %%T7, %%T2, %%T5, 0x00 ; %%T7 = a0*b0
+ vpclmulqdq %%T6, %%T2, %%T5, 0x01 ; %%T6 = a1*b0
+ vpclmulqdq %%T5, %%T2, %%T5, 0x10 ; %%T5 = a0*b1
+ vpxor %%T6, %%T6, %%T5
+
+ vmovdqu %%T1, [%%GDATA + 16*3]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP2]
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*4]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu %%T1, [rsp + TMP3]
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*5]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+
+ vmovdqu %%T1, [rsp + TMP4]
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*6]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP5]
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*7]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP6]
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vmovdqu %%T1, [%%GDATA + 16*8]
+ vaesenc %%XMM1, %%T1
+ vaesenc %%XMM2, %%T1
+ vaesenc %%XMM3, %%T1
+ vaesenc %%XMM4, %%T1
+ vaesenc %%XMM5, %%T1
+ vaesenc %%XMM6, %%T1
+ vaesenc %%XMM7, %%T1
+ vaesenc %%XMM8, %%T1
+
+ vmovdqu %%T1, [rsp + TMP7]
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T4, %%T4, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + 16*9]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T1, [rsp + TMP8]
+ vmovdqu %%T5, [%%GDATA + HashKey]
+
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x01
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x10
+ vpxor %%T6, %%T6, %%T3
+
+ vpclmulqdq %%T3, %%T1, %%T5, 0x11
+ vpxor %%T1, %%T4, %%T3
+
+
+ vmovdqu %%T5, [%%GDATA + 16*10]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*11]
+
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*12]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*13]
+ vaesenc %%XMM1, %%T5
+ vaesenc %%XMM2, %%T5
+ vaesenc %%XMM3, %%T5
+ vaesenc %%XMM4, %%T5
+ vaesenc %%XMM5, %%T5
+ vaesenc %%XMM6, %%T5
+ vaesenc %%XMM7, %%T5
+ vaesenc %%XMM8, %%T5
+
+ vmovdqu %%T5, [%%GDATA + 16*14]
+
+%assign i 0
+%assign j 1
+%rep 8
+ %ifdef NT_LD
+ VXLDR %%T2, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ vpxor %%T2, %%T2, %%T5
+ %else
+ vpxor %%T2, %%T5, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+ %endif
+
+ %ifidn %%ENC_DEC, ENC
+ vaesenclast reg(j), reg(j), %%T2
+ %else
+ vaesenclast %%T3, reg(j), %%T2
+ vpxor reg(j), %%T2, %%T5
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], %%T3
+ %endif
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ vpslldq %%T3, %%T6, 8 ; shift-L %%T3 2 DWs
+ vpsrldq %%T6, %%T6, 8 ; shift-R %%T6 2 DWs
+ vpxor %%T7, %%T7, %%T3
+ vpxor %%T1, %%T1, %%T6 ; accumulate the results in %%T1:%%T7
+
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ %ifidn %%ENC_DEC, ENC
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*0], %%XMM1 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*1], %%XMM2 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*2], %%XMM3 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*3], %%XMM4 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*4], %%XMM5 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*5], %%XMM6 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*6], %%XMM7 ; Write to the Ciphertext buffer
+ VXSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*7], %%XMM8 ; Write to the Ciphertext buffer
+ %endif
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T1, %%T1, %%T4 ; the result is in %%T1
+
+ vpshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ vpshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+
+ vpxor %%XMM1, %%T1
+
+
+
+%endmacro
+
+
+; GHASH the last 8 ciphertext blocks.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+ ;; Karatsuba Method
+
+ vmovdqu %%T5, [%%GDATA + HashKey_8]
+
+ vpshufd %%T2, %%XMM1, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM1
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T6, %%XMM1, %%T5, 0x11
+ vpclmulqdq %%T7, %%XMM1, %%T5, 0x00
+
+ vpclmulqdq %%XMM1, %%T2, %%T3, 0x00
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_7]
+ vpshufd %%T2, %%XMM2, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM2
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM2, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_6]
+ vpshufd %%T2, %%XMM3, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM3
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM3, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_5]
+ vpshufd %%T2, %%XMM4, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM4
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM4, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_4]
+ vpshufd %%T2, %%XMM5, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM5
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM5, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_3]
+ vpshufd %%T2, %%XMM6, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM6
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM6, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey_2]
+ vpshufd %%T2, %%XMM7, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM7
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM7, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+
+ ;;;;;;;;;;;;;;;;;;;;;;
+
+ vmovdqu %%T5, [%%GDATA + HashKey]
+ vpshufd %%T2, %%XMM8, 01001110b
+ vpshufd %%T3, %%T5, 01001110b
+ vpxor %%T2, %%T2, %%XMM8
+ vpxor %%T3, %%T3, %%T5
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x11
+ vpxor %%T6, %%T6, %%T4
+
+ vpclmulqdq %%T4, %%XMM8, %%T5, 0x00
+ vpxor %%T7, %%T7, %%T4
+
+ vpclmulqdq %%T2, %%T2, %%T3, 0x00
+
+ vpxor %%XMM1, %%XMM1, %%T2
+ vpxor %%XMM1, %%XMM1, %%T6
+ vpxor %%T2, %%XMM1, %%T7
+
+
+
+
+ vpslldq %%T4, %%T2, 8
+ vpsrldq %%T2, %%T2, 8
+
+ vpxor %%T7, %%T7, %%T4
+ vpxor %%T6, %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;first phase of the reduction
+ vmovdqu %%T3, [POLY2]
+
+ vpclmulqdq %%T2, %%T3, %%T7, 0x01
+ vpslldq %%T2, %%T2, 8 ; shift-L xmm2 2 DWs
+
+ vpxor %%T7, %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+ ;second phase of the reduction
+ vpclmulqdq %%T2, %%T3, %%T7, 0x00
+ vpsrldq %%T2, %%T2, 4 ; shift-R %%T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
+
+ vpclmulqdq %%T4, %%T3, %%T7, 0x10
+ vpslldq %%T4, %%T4, 4 ; shift-L %%T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
+
+ vpxor %%T4, %%T4, %%T2 ; second phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vpxor %%T6, %%T6, %%T4 ; the result is in %%T6
+%endmacro
+
+; Encryption of a single block
+%macro ENCRYPT_SINGLE_BLOCK 2
+%define %%GDATA %1
+%define %%XMM0 %2
+
+ vpxor %%XMM0, %%XMM0, [%%GDATA+16*0]
+%assign i 1
+%rep 13 ; early key rounds (13)
+ vaesenc %%XMM0, [%%GDATA+16*i]
+%assign i (i+1)
+%endrep
+ vaesenclast %%XMM0, [%%GDATA+16*i] ; final key round (14)
+%endmacro
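+
+; For reference, ENCRYPT_SINGLE_BLOCK above is a plain 14-round AES-256 block
+; encryption of %%XMM0 with the round keys held in GDATA, used for the
+; counter-block encryptions E(K, Y0) / E(K, Yn). A minimal C intrinsics sketch
+; of the same loop (assuming the 15 round keys are already expanded into a
+; __m128i array; <wmmintrin.h>) would be:
+;
+;     __m128i aes256_encrypt_block(__m128i block, const __m128i rk[15])
+;     {
+;         block = _mm_xor_si128(block, rk[0]);             /* key whitening  */
+;         for (int i = 1; i < 14; i++)
+;             block = _mm_aesenc_si128(block, rk[i]);      /* rounds 1..13   */
+;         return _mm_aesenclast_si128(block, rk[14]);      /* final round 14 */
+;     }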
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+        ;; Required for Update/GCM_ENC
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ vmovdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ vmovdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ vmovdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ vmovdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ vmovdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ vmovdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ vmovdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ vmovdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ vmovdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ vmovdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ vmovdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ vmovdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ vmovdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ vmovdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ vmovdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ vmovdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ vmovdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ vmovdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_data struct to prepare for encoding/decoding.
+; Input: gcm_data struct* (GDATA), IV, Additional Authentication data (A_IN), Additional
+; Data length (A_LEN)
+; Output: Updated GDATA with the hash of A_IN (AadHash) and the other GDATA fields initialized.
+; Clobbers rax, r10-r13 and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 4
+%define %%GDATA %1
+%define %%IV %2
+%define %%A_IN %3
+%define %%A_LEN %4
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ vmovdqu %%SUBHASH, [%%GDATA + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ vpxor xmm2, xmm3
+ mov r10, %%A_LEN
+
+ vmovdqu [%%GDATA + AadHash], %%AAD_HASH ; my_ctx_data.aad hash = aad_hash
+ mov [%%GDATA + AadLen], r10 ; my_ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA + InLen], r10 ; my_ctx_data.in_length = 0
+ mov [%%GDATA + PBlockLen], r10 ; my_ctx_data.partial_block_length = 0
+ vmovdqu [%%GDATA + PBlockEncKey], xmm2 ; my_ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ vmovdqu xmm2, [r10]
+ vmovdqu [%%GDATA + OrigIV], xmm2 ; my_ctx_data.orig_IV = iv
+
+ vpshufb xmm2, [SHUF_MASK]
+
+ vmovdqu [%%GDATA + CurCount], xmm2 ; my_ctx_data.current_counter = iv
+%endmacro
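+
+; Condensed view of the state GCM_INIT leaves behind, written as pseudo-C over
+; the gcm_data fields named in the stores above (field names follow the
+; comments, not a literal struct definition):
+;
+;     aad_hash             = CALC_AAD_HASH(A_IN, A_LEN, HashKey)
+;     aad_length           = A_LEN
+;     in_length            = 0
+;     partial_block_length = 0
+;     orig_IV              = *IV
+;     current_counter      = byte_reflect(*IV)    ; kept byte-reflected for counting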
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_data struct has been
+; initialized by GCM_INIT
+; Requires the input data to be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 5
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%ENC_DEC %5
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16byte blocks in the message
+; process (number of 16byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16 byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
+
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA+InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+ vmovdqu xmm13, [%%GDATA + HashKey] ; xmm13 = HashKey
+ vmovdqu xmm8, [%%GDATA + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+
+ mov r13, %%PLAIN_CYPH_LEN
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ; save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ vmovd r15d, xmm9
+ and r15d, 255
+ vpshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ vpshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ vpshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ vpshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ GHASH_LAST_8 %%GDATA, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ vmovdqu [%%GDATA + AadHash], xmm14 ; my_ctx_data.aad hash = xmm14
+ vmovdqu [%%GDATA + CurCount], xmm9 ; my_ctx_data.current_counter = xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+        mov     [%%GDATA + PBlockLen], r13              ; my_ctx_data.partial_block_length = r13
+        ; handle the last <16 Byte block separately
+
+
+ vpaddd xmm9, xmm9, [ONE] ; INCR CNT to get Yn
+ vmovdqu [%%GDATA + CurCount], xmm9
+
+ vpshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Yn)
+ vmovdqu [%%GDATA + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ vmovdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+
+ vmovdqu xmm2, [r12] ; get the appropriate shuffle mask
+ vpshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+ %ifidn %%ENC_DEC, DEC
+ vmovdqa xmm2, xmm1
+ vpxor xmm9, xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpand xmm2, xmm2, xmm1
+ vpshufb xmm2, [SHUF_MASK]
+ vpxor xmm14, xmm14, xmm2
+ vmovdqu [%%GDATA + AadHash], xmm14
+ %else
+ vpxor xmm9, xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ vmovdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ vpand xmm9, xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ vpshufb xmm9, [SHUF_MASK]
+ vpxor xmm14, xmm14, xmm9
+ vmovdqu [%%GDATA + AadHash], xmm14
+ vpshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ %endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ vmovq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ vpsrldq xmm9, xmm9, 8
+ vmovq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_data struct* (GDATA) and whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 4
+%define %%GDATA %1
+%define %%AUTH_TAG %2
+%define %%AUTH_TAG_LEN %3
+%define %%ENC_DEC %4
+%define %%PLAIN_CYPH_LEN rax
+
+ mov r12, [%%GDATA + PBlockLen]
+ vmovdqu xmm14, [%%GDATA+AadHash]
+ vmovdqu xmm13, [%%GDATA+HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ vmovdqu [%%GDATA+AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA+InLen]
+
+ shl r12, 3 ; convert into number of bits
+ vmovd xmm15, r12d ; len(A) in xmm15
+
+ shl %%PLAIN_CYPH_LEN, 3 ; len(C) in bits (*128)
+ vmovq xmm1, %%PLAIN_CYPH_LEN
+ vpslldq xmm15, xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ vpxor xmm15, xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ vpxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ vpshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ vmovdqu xmm9, [%%GDATA + OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9 ; E(K, Y0)
+
+ vpxor xmm9, xmm9, xmm14
+
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8:
+ vmovq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ vmovq rax, xmm9
+ mov [r10], rax
+ vpsrldq xmm9, xmm9, 8
+ vmovd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ vmovdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ; GCM_COMPLETE
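+
+; In GCM terms, with Y0 the original counter block (OrigIV) and H the hash key,
+; the macro above finishes the tag as
+;
+;     T = E(K, Y0) XOR GHASH_H( AAD || C || len(AAD)||len(C) )
+;
+; The running GHASH over AAD and ciphertext is already sitting in AadHash when
+; this macro runs, so only the deferred partial-block multiply, the length
+; block, and the final E(K, Y0) XOR are applied here.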
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_precomp_avx_gen4
+; (gcm_data *my_ctx_data);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_precomp_avx_gen4
+aesni_gcm256_precomp_avx_gen4:
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ vmovdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ vpxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey
+
+ vpshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ vmovdqa xmm2, xmm6
+ vpsllq xmm6, xmm6, 1
+ vpsrlq xmm2, xmm2, 63
+ vmovdqa xmm1, xmm2
+ vpslldq xmm2, xmm2, 8
+ vpsrldq xmm1, xmm1, 8
+ vpor xmm6, xmm6, xmm2
+ ;reduction
+ vpshufd xmm2, xmm1, 00100100b
+ vpcmpeqd xmm2, [TWOONE]
+ vpand xmm2, xmm2, [POLY]
+ vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_init_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len); /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_init_avx_gen4
+aesni_gcm256_init_avx_gen4:
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 1*16
+ vmovdqu [rsp + 0*16],xmm6
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+%endif
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_update_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len); /* Length of data in Bytes for encryption. must be a multiple of 16 bytes*/
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_update_avx_gen4
+aesni_gcm256_enc_update_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_update_avx_gen4(
+; gcm_data *my_ctx_data,
+;        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
+;        const   u8 *in, /* Ciphertext input */
+;        u64     plaintext_len); /* Length of data in Bytes for decryption. must be a multiple of 16 bytes*/
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_update_avx_gen4
+aesni_gcm256_dec_update_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_finalize_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_finalize_avx_gen4
+aesni_gcm256_enc_finalize_avx_gen4:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
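+
+; Illustrative C-level call sequence for the streaming encrypt path, using the
+; prototypes documented above. The gcm_data layout and the key-expansion step
+; that fills it come from the library's C header and are assumed here, not
+; shown in this file:
+;
+;     /* AES-256 key schedule already expanded into gdata (library-specific) */
+;     aesni_gcm256_precomp_avx_gen4(&gdata);
+;     aesni_gcm256_init_avx_gen4(&gdata, iv, aad, aad_len);
+;     aesni_gcm256_enc_update_avx_gen4(&gdata, out, in, len);   /* may repeat; len a multiple of 16 */
+;     aesni_gcm256_enc_finalize_avx_gen4(&gdata, tag, 16);      /* tag length 16, 12 or 8 */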
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_finalize_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_finalize_avx_gen4
+aesni_gcm256_dec_finalize_avx_gen4:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ vmovdqu [rsp + 0*16],xmm6
+ vmovdqu [rsp + 1*16],xmm9
+ vmovdqu [rsp + 2*16],xmm11
+ vmovdqu [rsp + 3*16],xmm14
+ vmovdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm15 , [rsp + 4*16]
+ vmovdqu xmm14 , [rsp + 3*16]
+ vmovdqu xmm11 , [rsp + 2*16]
+ vmovdqu xmm9 , [rsp + 1*16]
+ vmovdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len, /* Length of data in Bytes for encryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_avx_gen4
+aesni_gcm256_enc_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ GCM_COMPLETE arg1, arg8, arg9, ENC
+
+ FUNC_RESTORE
+
+ ret
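+
+; The single-shot entry point above is simply GCM_INIT + GCM_ENC_DEC +
+; GCM_COMPLETE in one call; a caller holding the whole message in memory would
+; use (illustrative, arguments per the prototype above)
+;
+;     aesni_gcm256_enc_avx_gen4(&gdata, ct, pt, pt_len, iv, aad, aad_len, tag, 16);
+;
+; instead of the three-step sequence sketched after the finalize routine.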
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_avx_gen4(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+;        u64     plaintext_len, /* Length of data in Bytes for decryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_avx_gen4
+aesni_gcm256_dec_avx_gen4:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ GCM_COMPLETE arg1, arg8, arg9, DEC
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm
new file mode 100644
index 000000000..ab49e0770
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm256_sse.asm
@@ -0,0 +1,2074 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+;
+;
+; References:
+; This code was derived and highly optimized from the code described in paper:
+;       Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+;
+; Assumptions:
+;
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+;
+;
+; AAD:
+; AAD will be padded with 0 to the next 16byte multiple
+; for example, assume AAD is a u32 vector
+;
+; if AAD is 8 bytes:
+; AAD[3] = {A0, A1};
+; padded AAD in xmm register = {A1 A0 0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A1) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 32-bit Sequence Number (A0) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 32-bit Sequence Number
+;
+; if AAD is 12 bytes:
+; AAD[3] = {A0, A1, A2};
+; padded AAD in xmm register = {A2 A1 A0 0}
+;
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | SPI (A2) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 64-bit Extended Sequence Number {A1,A0} |
+; | |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x0 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; AAD Format with 64-bit Extended Sequence Number
+;
+;
+; aadLen:
+;       From the definition of the spec, it must be a multiple of 4 bytes.
+;       The code additionally supports any aadLen.
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+; poly = x^128 + x^127 + x^126 + x^121 + 1
+; throughout the code, one tab and two tab indentations are used. one tab is for GHASH part, two tabs is for AES part.
+;
+
+%include "reg_sizes.asm"
+%include "gcm_defines.asm"
+
+default rel
+; need to push 4 registers into stack to maintain
+%define STACK_OFFSET 8*4
+
+%define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register)
+%define TMP3 16*1 ; Temporary storage for AES State 3
+%define TMP4 16*2 ; Temporary storage for AES State 4
+%define TMP5 16*3 ; Temporary storage for AES State 5
+%define TMP6 16*4 ; Temporary storage for AES State 6
+%define TMP7 16*5 ; Temporary storage for AES State 7
+%define TMP8 16*6 ; Temporary storage for AES State 8
+
+%define LOCAL_STORAGE 16*7
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define XMM_STORAGE 16*10
+%else
+ %define XMM_STORAGE 0
+%endif
+
+%define VARIABLE_OFFSET LOCAL_STORAGE + XMM_STORAGE
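+
+; Frame layout implied by the defines above (offsets from the aligned stack
+; pointer set up by this file's prologue):
+;
+;     rsp + 16*0 .. rsp + 16*7 - 1     TMP2..TMP8, spill slots for AES states 2..8
+;     rsp + LOCAL_STORAGE ..           win64-only save area for the 10 callee-saved xmm registers
+;     VARIABLE_OFFSET                  total bytes reserved for both regions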
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Utility Macros
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+; Input: A and B (128-bits each, bit-reflected)
+; Output: C = A*B*x mod poly, (i.e. >>1 )
+; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GHASH_MUL 7
+%define %%GH %1 ; 16 Bytes
+%define %%HK %2 ; 16 Bytes
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+ ; %%GH, %%HK hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Karatsuba Method
+ movdqa %%T1, %%GH
+ pshufd %%T2, %%GH, 01001110b
+ pshufd %%T3, %%HK, 01001110b
+ pxor %%T2, %%GH ; %%T2 = (a1+a0)
+ pxor %%T3, %%HK ; %%T3 = (b1+b0)
+
+ pclmulqdq %%T1, %%HK, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%GH, %%HK, 0x00 ; %%GH = a0*b0
+ pclmulqdq %%T2, %%T3, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T2, %%GH
+ pxor %%T2, %%T1 ; %%T2 = a0*b1+a1*b0
+
+ movdqa %%T3, %%T2
+ pslldq %%T3, 8 ; shift-L %%T3 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%GH, %%T3
+ pxor %%T1, %%T2 ; <%%T1:%%GH> holds the result of the carry-less multiplication of %%GH by %%HK
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%GH
+ movdqa %%T3, %%GH
+ movdqa %%T4, %%GH ; move %%GH into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+        pslld   %%T2, 31                                ; packed left shifting << 31
+        pslld   %%T3, 30                                ; packed left shifting << 30
+        pslld   %%T4, 25                                ; packed left shifting << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T4
+
+ movdqa %%T5, %%T2
+ psrldq %%T5, 4 ; shift-R %%T5 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%GH, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+        movdqa  %%T2,%%GH                               ; make 3 copies of %%GH (in %%T2, %%T3, %%T4) for doing three shift operations
+ movdqa %%T3,%%GH
+ movdqa %%T4,%%GH
+
+        psrld   %%T2,1                                  ; packed right shifting >> 1
+        psrld   %%T3,2                                  ; packed right shifting >> 2
+        psrld   %%T4,7                                  ; packed right shifting >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T4
+
+ pxor %%T2, %%T5
+ pxor %%GH, %%T2
+        pxor    %%GH, %%T1                              ; the result is in %%GH
+
+
+%endmacro
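+
+; Sketch of the algebra implemented above: writing A = a1*x^64 + a0 and
+; B = b1*x^64 + b0 in GF(2^128), Karatsuba needs only three carry-less products:
+;
+;     A*B = a1*b1*x^128 + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64 + a0*b0
+;
+; (addition is XOR in GF(2)). The 256-bit product is then folded back to
+; 128 bits with the shift/XOR reduction for poly = x^128 + x^127 + x^126 +
+; x^121 + 1, following the Gueron/Kounavis method referenced in the file header.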
+
+
+%macro PRECOMPUTE 8
+%define %%GDATA %1
+%define %%HK %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+%define %%T4 %6
+%define %%T5 %7
+%define %%T6 %8
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Hashkey_i_k holds XORed values of the low and high parts of the Hashkey_i
+ movdqa %%T4, %%HK
+ pshufd %%T1, %%HK, 01001110b
+ pxor %%T1, %%HK
+ movdqu [%%GDATA + HashKey_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^2<<1 mod poly
+ movdqu [%%GDATA + HashKey_2], %%T4 ; [HashKey_2] = HashKey^2<<1 mod poly
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_2_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^3<<1 mod poly
+ movdqu [%%GDATA + HashKey_3], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_3_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^4<<1 mod poly
+ movdqu [%%GDATA + HashKey_4], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_4_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^5<<1 mod poly
+ movdqu [%%GDATA + HashKey_5], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_5_k], %%T1
+
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^6<<1 mod poly
+ movdqu [%%GDATA + HashKey_6], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_6_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^7<<1 mod poly
+ movdqu [%%GDATA + HashKey_7], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_7_k], %%T1
+
+ GHASH_MUL %%T4, %%HK, %%T1, %%T2, %%T3, %%T5, %%T6 ; %%T4 = HashKey^8<<1 mod poly
+ movdqu [%%GDATA + HashKey_8], %%T4
+ pshufd %%T1, %%T4, 01001110b
+ pxor %%T1, %%T4
+ movdqu [%%GDATA + HashKey_8_k], %%T1
+
+
+%endmacro
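+
+; Summary of what PRECOMPUTE stores: each power of the hash key (HashKey,
+; HashKey_2 .. HashKey_8, all already <<1 mod poly) gets a companion *_k slot
+; holding hi64 XOR lo64 of that power, i.e. the precomputed (b1+b0) operand of
+; the Karatsuba multiply, so the parallel GHASH code does not recompute it per
+; block:
+;
+;     HashKey_i_k = hi64(HashKey_i) ^ lo64(HashKey_i)    (replicated in both halves)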
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; READ_SMALL_DATA_INPUT: Packs xmm register with data when data input is less than 16 bytes.
+; Returns 0 if data has length 0.
+; Input: The input data (INPUT), that data's length (LENGTH).
+; Output: The packed xmm register (OUTPUT).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro READ_SMALL_DATA_INPUT 6
+%define %%OUTPUT %1 ; %%OUTPUT is an xmm register
+%define %%INPUT %2
+%define %%LENGTH %3
+%define %%END_READ_LOCATION %4 ; All this and the lower inputs are temp registers
+%define %%COUNTER %5
+%define %%TMP1 %6
+
+ pxor %%OUTPUT, %%OUTPUT
+ mov %%COUNTER, %%LENGTH
+ mov %%END_READ_LOCATION, %%INPUT
+ add %%END_READ_LOCATION, %%LENGTH
+ xor %%TMP1, %%TMP1
+
+
+ cmp %%COUNTER, 8
+ jl %%_byte_loop_2
+        pinsrq  %%OUTPUT, [%%INPUT],0                   ;Read in 8 bytes if they exist
+ je %%_done
+
+ sub %%COUNTER, 8
+
+%%_byte_loop_1: ;Read in data 1 byte at a time while data is left
+ shl %%TMP1, 8 ;This loop handles when 8 bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_1
+ pinsrq %%OUTPUT, %%TMP1, 1
+ jmp %%_done
+
+%%_byte_loop_2: ;Read in data 1 byte at a time while data is left
+ cmp %%COUNTER, 0
+ je %%_done
+ shl %%TMP1, 8 ;This loop handles when no bytes were already read in
+ dec %%END_READ_LOCATION
+ mov BYTE(%%TMP1), BYTE [%%END_READ_LOCATION]
+ dec %%COUNTER
+ jg %%_byte_loop_2
+ pinsrq %%OUTPUT, %%TMP1, 0
+%%_done:
+
+%endmacro ; READ_SMALL_DATA_INPUT
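+
+; C sketch of the read above (function name hypothetical; <stdint.h>,
+; <string.h>): the point is that for len < 16 no byte past in+len is ever
+; touched, because the tail is pulled in one byte at a time from the end:
+;
+;     void read_small(uint8_t out[16], const uint8_t *in, size_t len)
+;     {
+;         size_t i = 0;
+;         memset(out, 0, 16);
+;         if (len >= 8) { memcpy(out, in, 8); i = 8; }
+;         for (size_t j = len; j > i; j--)     /* last byte first */
+;             out[j - 1] = in[j - 1];
+;     }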
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
+; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
+; Output: The hash of the data (AAD_HASH).
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro CALC_AAD_HASH 14
+%define %%A_IN %1
+%define %%A_LEN %2
+%define %%AAD_HASH %3
+%define %%HASH_KEY %4
+%define %%XTMP1 %5 ; xmm temp reg 5
+%define %%XTMP2 %6
+%define %%XTMP3 %7
+%define %%XTMP4 %8
+%define %%XTMP5 %9 ; xmm temp reg 5
+%define %%T1 %10 ; temp reg 1
+%define %%T2 %11
+%define %%T3 %12
+%define %%T4 %13
+%define %%T5 %14 ; temp reg 5
+
+
+ mov %%T1, %%A_IN ; T1 = AAD
+ mov %%T2, %%A_LEN ; T2 = aadLen
+ pxor %%AAD_HASH, %%AAD_HASH
+
+ cmp %%T2, 16
+ jl %%_get_small_AAD_block
+
+%%_get_AAD_loop16:
+
+ movdqu %%XTMP1, [%%T1]
+ ;byte-reflect the AAD data
+ pshufb %%XTMP1, [SHUF_MASK]
+ pxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+ sub %%T2, 16
+ je %%_CALC_AAD_done
+
+ add %%T1, 16
+ cmp %%T2, 16
+ jge %%_get_AAD_loop16
+
+%%_get_small_AAD_block:
+ READ_SMALL_DATA_INPUT %%XTMP1, %%T1, %%T2, %%T3, %%T4, %%T5
+ ;byte-reflect the AAD data
+ pshufb %%XTMP1, [SHUF_MASK]
+ pxor %%AAD_HASH, %%XTMP1
+ GHASH_MUL %%AAD_HASH, %%HASH_KEY, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5
+
+%%_CALC_AAD_done:
+
+%endmacro ; CALC_AAD_HASH
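+
+; In pseudo-C (ghash_mul standing in for the GHASH_MUL macro above, names
+; hypothetical), the AAD hash computed here is:
+;
+;     hash = 0;
+;     for (each 16-byte chunk B of the AAD, last chunk zero-padded)
+;         hash = ghash_mul(hash ^ byte_reflect(B), HashKey);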
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; PARTIAL_BLOCK: Handles encryption/decryption and the GHASH of partial blocks between update calls.
+; Requires the input data be at least 1 byte long.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; the current data offset (DATA_OFFSET), and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the first partial block (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11, xmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro PARTIAL_BLOCK 7
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%DATA_OFFSET %5
+%define %%AAD_HASH %6
+%define %%ENC_DEC %7
+ mov r13, [%%GDATA + PBlockLen]
+ cmp r13, 0
+ je %%_partial_block_done ;Leave Macro if no partial blocks
+
+ cmp %%PLAIN_CYPH_LEN, 16 ;Read in input data without over reading
+ jl %%_fewer_than_16_bytes
+ XLDR xmm1, [%%PLAIN_CYPH_IN] ;If more than 16 bytes of data, just fill the xmm register
+ jmp %%_data_read
+
+%%_fewer_than_16_bytes:
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, %%PLAIN_CYPH_LEN, rax, r12, r15
+ mov r13, [%%GDATA + PBlockLen]
+
+%%_data_read: ;Finished reading in data
+
+
+ movdqu xmm9, [%%GDATA + PBlockEncKey] ;xmm9 = my_ctx_data.partial_block_enc_key
+ movdqu xmm13, [%%GDATA + HashKey]
+
+ lea r12, [SHIFT_MASK]
+
+ add r12, r13 ; adjust the shuffle mask pointer to be able to shift r13 bytes (16-r13 is the number of bytes in plaintext mod 16)
+ movdqu xmm2, [r12] ; get the appropriate shuffle mask
+ pshufb xmm9, xmm2 ;shift right r13 bytes
+
+%ifidn %%ENC_DEC, DEC
+ movdqa xmm3, xmm1
+ pxor xmm9, xmm1 ; Cyphertext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_1                      ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_1:
+
+ movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ pand xmm3, xmm1
+ pshufb xmm3, [SHUF_MASK]
+ pshufb xmm3, xmm2
+ pxor %%AAD_HASH, xmm3
+
+
+ cmp r15,0
+ jl %%_partial_incomplete_1
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax
+ jmp %%_dec_done
+%%_partial_incomplete_1:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN
+%%_dec_done:
+ movdqu [%%GDATA + AadHash], %%AAD_HASH
+
+%else
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+
+ mov r15, %%PLAIN_CYPH_LEN
+ add r15, r13
+ sub r15, 16 ;Set r15 to be the amount of data left in CYPH_PLAIN_IN after filling the block
+        jge     %%_no_extra_mask_2                      ;Determine if partial block is not being filled and shift mask accordingly
+ sub r12, r15
+%%_no_extra_mask_2:
+
+ movdqu xmm1, [r12 + ALL_F-SHIFT_MASK] ; get the appropriate mask to mask out bottom r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out bottom r13 bytes of xmm9
+
+ pshufb xmm9, [SHUF_MASK]
+ pshufb xmm9, xmm2
+ pxor %%AAD_HASH, xmm9
+
+ cmp r15,0
+ jl %%_partial_incomplete_2
+
+ GHASH_MUL %%AAD_HASH, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ xor rax,rax
+ mov [%%GDATA+PBlockLen], rax
+ jmp %%_encode_done
+%%_partial_incomplete_2:
+ add [%%GDATA+PBlockLen], %%PLAIN_CYPH_LEN
+%%_encode_done:
+ movdqu [%%GDATA + AadHash], %%AAD_HASH
+
+ pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ pshufb xmm9, xmm2
+%endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output encrypted Bytes
+ cmp r15,0
+ jl %%_partial_fill
+ mov r12, r13
+ mov r13, 16
+ sub r13, r12 ; Set r13 to be the number of bytes to write out
+ jmp %%_count_set
+%%_partial_fill:
+ mov r13, %%PLAIN_CYPH_LEN
+%%_count_set:
+ movq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT+ %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ psrldq xmm9, 8
+ movq rax, xmm9
+ sub r13, 8
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%%_partial_block_done:
+%endmacro ; PARTIAL_BLOCK
+
+
+; if a = number of total plaintext bytes
+; b = floor(a/16)
+; %%num_initial_blocks = b mod 8;
+; encrypt the initial %%num_initial_blocks blocks and apply ghash on the ciphertext
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r14 are used as pointers only, not modified
+; Updated AAD_HASH is returned in %%T3
+
+%macro INITIAL_BLOCKS 23
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%LENGTH %4
+%define %%DATA_OFFSET %5
+%define %%num_initial_blocks %6 ; can be 0, 1, 2, 3, 4, 5, 6 or 7
+%define %%T1 %7
+%define %%HASH_KEY %8
+%define %%T3 %9
+%define %%T4 %10
+%define %%T5 %11
+%define %%CTR %12
+%define %%XMM1 %13
+%define %%XMM2 %14
+%define %%XMM3 %15
+%define %%XMM4 %16
+%define %%XMM5 %17
+%define %%XMM6 %18
+%define %%XMM7 %19
+%define %%XMM8 %20
+%define %%T6 %21
+%define %%T_key %22
+%define %%ENC_DEC %23
+
+%assign i (8-%%num_initial_blocks)
+ movdqu reg(i), %%XMM8 ; move AAD_HASH to temp reg
+
+ ; start AES for %%num_initial_blocks blocks
+ movdqu %%CTR, [%%GDATA + CurCount] ; %%CTR = Y0
+
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa reg(i), %%CTR
+ pshufb reg(i), [SHUF_MASK] ; perform a 16Byte swap
+%assign i (i+1)
+%endrep
+
+movdqu %%T_key, [%%GDATA+16*0]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ pxor reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j 1
+%rep 13 ; encrypt N blocks with 13 key rounds
+movdqu %%T_key, [%%GDATA+16*j]
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ aesenc reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign j (j+1)
+%endrep
+
+
+movdqu %%T_key, [%%GDATA+16*j] ; encrypt with last (14th) key round
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ aesenclast reg(i),%%T_key
+%assign i (i+1)
+%endrep
+
+%assign i (9-%%num_initial_blocks)
+%rep %%num_initial_blocks
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ pxor reg(i), %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], reg(i) ; write back ciphertext for %%num_initial_blocks blocks
+ add %%DATA_OFFSET, 16
+ %ifidn %%ENC_DEC, DEC
+ movdqa reg(i), %%T1
+ %endif
+ pshufb reg(i), [SHUF_MASK] ; prepare ciphertext for GHASH computations
+%assign i (i+1)
+%endrep
+
+
+%assign i (8-%%num_initial_blocks)
+%assign j (9-%%num_initial_blocks)
+
+%rep %%num_initial_blocks
+ pxor reg(j), reg(i)
+ GHASH_MUL reg(j), %%HASH_KEY, %%T1, %%T3, %%T4, %%T5, %%T6 ; apply GHASH on %%num_initial_blocks blocks
+%assign i (i+1)
+%assign j (j+1)
+%endrep
+ ; %%XMM8 has the current Hash Value
+ movdqa %%T3, %%XMM8
+
+ cmp %%LENGTH, 128
+ jl %%_initial_blocks_done ; no need for precomputed constants
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; prepare and start encrypting the next 8 counter blocks for the main loop
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM1, %%CTR
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM2, %%CTR
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM3, %%CTR
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM4, %%CTR
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM5, %%CTR
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM6, %%CTR
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM7, %%CTR
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+
+ paddd %%CTR, [ONE] ; INCR Y0
+ movdqa %%XMM8, %%CTR
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ movdqu %%T_key, [%%GDATA+16*0]
+ pxor %%XMM1, %%T_key
+ pxor %%XMM2, %%T_key
+ pxor %%XMM3, %%T_key
+ pxor %%XMM4, %%T_key
+ pxor %%XMM5, %%T_key
+ pxor %%XMM6, %%T_key
+ pxor %%XMM7, %%T_key
+ pxor %%XMM8, %%T_key
+
+
+%assign i 1
+%rep 13 ; do early (13) rounds
+ movdqu %%T_key, [%%GDATA+16*i]
+ aesenc %%XMM1, %%T_key
+ aesenc %%XMM2, %%T_key
+ aesenc %%XMM3, %%T_key
+ aesenc %%XMM4, %%T_key
+ aesenc %%XMM5, %%T_key
+ aesenc %%XMM6, %%T_key
+ aesenc %%XMM7, %%T_key
+ aesenc %%XMM8, %%T_key
+%assign i (i+1)
+%endrep
+
+
+ movdqu %%T_key, [%%GDATA+16*i] ; do final key round
+ aesenclast %%XMM1, %%T_key
+ aesenclast %%XMM2, %%T_key
+ aesenclast %%XMM3, %%T_key
+ aesenclast %%XMM4, %%T_key
+ aesenclast %%XMM5, %%T_key
+ aesenclast %%XMM6, %%T_key
+ aesenclast %%XMM7, %%T_key
+ aesenclast %%XMM8, %%T_key
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*0]
+ pxor %%XMM1, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*0], %%XMM1
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM1, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*1]
+ pxor %%XMM2, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*1], %%XMM2
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM2, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*2]
+ pxor %%XMM3, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*2], %%XMM3
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM3, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*3]
+ pxor %%XMM4, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*3], %%XMM4
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM4, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*4]
+ pxor %%XMM5, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*4], %%XMM5
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM5, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*5]
+ pxor %%XMM6, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*5], %%XMM6
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM6, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*6]
+ pxor %%XMM7, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*6], %%XMM7
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM7, %%T1
+ %endif
+
+ XLDR %%T1, [%%PLAIN_CYPH_IN + %%DATA_OFFSET + 16*7]
+ pxor %%XMM8, %%T1
+ XSTR [%%CYPH_PLAIN_OUT + %%DATA_OFFSET + 16*7], %%XMM8
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%XMM8, %%T1
+ %endif
+
+ add %%DATA_OFFSET, 128
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pxor %%XMM1, %%T3 ; combine GHASHed value with the corresponding ciphertext
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_initial_blocks_done:
+
+
+%endmacro
+
+
+
+; encrypt 8 blocks at a time
+; ghash the 8 previously encrypted ciphertext blocks
+; %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN are used as pointers only, not modified
+; %%DATA_OFFSET is the data offset value
+%macro GHASH_8_ENCRYPT_8_PARALLEL 22
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%DATA_OFFSET %4
+%define %%T1 %5
+%define %%T2 %6
+%define %%T3 %7
+%define %%T4 %8
+%define %%T5 %9
+%define %%T6 %10
+%define %%CTR %11
+%define %%XMM1 %12
+%define %%XMM2 %13
+%define %%XMM3 %14
+%define %%XMM4 %15
+%define %%XMM5 %16
+%define %%XMM6 %17
+%define %%XMM7 %18
+%define %%XMM8 %19
+%define %%T7 %20
+%define %%loop_idx %21
+%define %%ENC_DEC %22
+
+ movdqa %%T7, %%XMM1
+ movdqu [rsp + TMP2], %%XMM2
+ movdqu [rsp + TMP3], %%XMM3
+ movdqu [rsp + TMP4], %%XMM4
+ movdqu [rsp + TMP5], %%XMM5
+ movdqu [rsp + TMP6], %%XMM6
+ movdqu [rsp + TMP7], %%XMM7
+ movdqu [rsp + TMP8], %%XMM8
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+
+ movdqa %%T4, %%T7
+ pshufd %%T6, %%T7, 01001110b
+ pxor %%T6, %%T7
+ %ifidn %%loop_idx, in_order
+ paddd %%CTR, [ONE] ; INCR CNT
+ %else
+ paddd %%CTR, [ONEf] ; INCR CNT
+ %endif
+ movdqu %%T5, [%%GDATA + HashKey_8]
+ pclmulqdq %%T4, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T7, %%T5, 0x00 ; %%T7 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_8_k]
+ pclmulqdq %%T6, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ movdqa %%XMM1, %%CTR
+
+ %ifidn %%loop_idx, in_order
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM2, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM3, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM4, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM5, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM6, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM7, %%CTR
+
+ paddd %%CTR, [ONE] ; INCR CNT
+ movdqa %%XMM8, %%CTR
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+ %else
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM2, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM3, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM4, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM5, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM6, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM7, %%CTR
+
+ paddd %%CTR, [ONEf] ; INCR CNT
+ movdqa %%XMM8, %%CTR
+ %endif
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ movdqu %%T1, [%%GDATA + 16*0]
+ pxor %%XMM1, %%T1
+ pxor %%XMM2, %%T1
+ pxor %%XMM3, %%T1
+ pxor %%XMM4, %%T1
+ pxor %%XMM5, %%T1
+ pxor %%XMM6, %%T1
+ pxor %%XMM7, %%T1
+ pxor %%XMM8, %%T1
+
+ ;; %%XMM6, %%T5 hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+ movdqu %%T1, [rsp + TMP2]
+ movdqa %%T3, %%T1
+
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_7]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_7_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*1]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ movdqu %%T1, [%%GDATA + 16*2]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; Karatsuba Method
+ movdqu %%T1, [rsp + TMP3]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_6]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_6_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*3]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP4]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_5]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_5_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*4]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*5]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP5]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_4]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_4_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+
+ movdqu %%T1, [%%GDATA + 16*6]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+ movdqu %%T1, [rsp + TMP6]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_3]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_3_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*7]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [rsp + TMP7]
+ movdqa %%T3, %%T1
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey_2]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_2_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T4, %%T1 ; accumulate the results in %%T4:%%T7, %%T6 holds the middle part
+ pxor %%T7, %%T3
+ pxor %%T6, %%T2
+
+ movdqu %%T1, [%%GDATA + 16*8]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ ;; %%XMM8, %%T5 hold the values for the two operands which are carry-less multiplied
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ;; Karatsuba Method
+ movdqu %%T1, [rsp + TMP8]
+ movdqa %%T3, %%T1
+
+ pshufd %%T2, %%T3, 01001110b
+ pxor %%T2, %%T3
+ movdqu %%T5, [%%GDATA + HashKey]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+ pclmulqdq %%T3, %%T5, 0x00 ; %%T3 = a0*b0
+ movdqu %%T5, [%%GDATA + HashKey_k]
+ pclmulqdq %%T2, %%T5, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+ pxor %%T7, %%T3
+ pxor %%T4, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*9]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+
+ movdqu %%T1, [%%GDATA + 16*10]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*11]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*12]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T1, [%%GDATA + 16*13]
+ aesenc %%XMM1, %%T1
+ aesenc %%XMM2, %%T1
+ aesenc %%XMM3, %%T1
+ aesenc %%XMM4, %%T1
+ aesenc %%XMM5, %%T1
+ aesenc %%XMM6, %%T1
+ aesenc %%XMM7, %%T1
+ aesenc %%XMM8, %%T1
+
+ movdqu %%T5, [%%GDATA + 16*14] ; finish last key round
+
+
+%assign i 0
+%assign j 1
+%rep 8
+ XLDR %%T1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET+16*i]
+
+ %ifidn %%ENC_DEC, DEC
+ movdqa %%T3, %%T1
+ %endif
+
+ pxor %%T1, %%T5
+ aesenclast reg(j), %%T1 ; XMM1:XMM8
+ XSTR [%%CYPH_PLAIN_OUT+%%DATA_OFFSET+16*i], reg(j) ; Write to the Output buffer
+
+ %ifidn %%ENC_DEC, DEC
+ movdqa reg(j), %%T3
+ %endif
+%assign i (i+1)
+%assign j (j+1)
+%endrep
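+
+        ;; Note on the loop above: the input block is XORed with the final round
+        ;; key and passed to AESENCLAST, which produces the CTR-encrypted output
+        ;; directly in reg(j); for decryption the original ciphertext (saved in
+        ;; %%T3 beforehand) is put back into reg(j) afterwards so the next GHASH
+        ;; pass authenticates ciphertext rather than the recovered plaintext.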
+
+
+
+
+ pxor %%T2, %%T6
+ pxor %%T2, %%T4
+ pxor %%T2, %%T7
+
+
+ movdqa %%T3, %%T2
+ pslldq %%T3, 8 ; shift-L %%T3 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%T7, %%T3
+ pxor %%T4, %%T2 ; accumulate the results in %%T4:%%T7
+
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%T7
+ movdqa %%T3, %%T7
+ movdqa %%T1, %%T7 ; move %%T7 into %%T2, %%T3, %%T1 in order to perform the three shifts independently
+
+        pslld   %%T2, 31                                ; packed left shift << 31
+        pslld   %%T3, 30                                ; packed left shift << 30
+        pslld   %%T1, 25                                ; packed left shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T1
+
+ movdqa %%T5, %%T2
+ psrldq %%T5, 4 ; shift-R %%T5 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ pshufb %%XMM1, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM2, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM3, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM4, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM5, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM6, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM7, [SHUF_MASK] ; perform a 16Byte swap
+ pshufb %%XMM8, [SHUF_MASK] ; perform a 16Byte swap
+
+ ;second phase of the reduction
+        movdqa  %%T2,%%T7                               ; make 3 copies of %%T7 (in %%T2, %%T3, %%T1) for doing three shift operations
+ movdqa %%T3,%%T7
+ movdqa %%T1,%%T7
+
+        psrld   %%T2,1                                  ; packed right shift >> 1
+        psrld   %%T3,2                                  ; packed right shift >> 2
+        psrld   %%T1,7                                  ; packed right shift >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T1
+
+ pxor %%T2, %%T5
+ pxor %%T7, %%T2
+        pxor    %%T7, %%T4                              ; the result is in %%T7
+
+
+ pxor %%XMM1, %%T7
+
+%endmacro
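+
+; Worked form of the Karatsuba split used above (an informal sketch): with each
+; 128-bit operand split into 64-bit halves, a = a1:a0 and b = b1:b0, the 256-bit
+; carry-less product is
+;         a*b = (a1*b1)*x^128  xor  M*x^64  xor  (a0*b0)
+; where   M   = (a1 xor a0)*(b1 xor b0)  xor  (a1*b1)  xor  (a0*b0)
+; so each block costs three PCLMULQDQ operations. The HashKey_n_k constants hold
+; the precomputed (b1 xor b0) for each hash-key power, which is why the middle
+; product takes a single PCLMULQDQ here. %%T4 and %%T7 accumulate the high and
+; low products, %%T6 accumulates the middle terms, and that middle sum is split
+; with PSLLDQ/PSRLDQ and folded into %%T4:%%T7 before the reduction.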
+
+
+; GHASH the last 8 ciphertext blocks.
+%macro GHASH_LAST_8 16
+%define %%GDATA %1
+%define %%T1 %2
+%define %%T2 %3
+%define %%T3 %4
+%define %%T4 %5
+%define %%T5 %6
+%define %%T6 %7
+%define %%T7 %8
+%define %%XMM1 %9
+%define %%XMM2 %10
+%define %%XMM3 %11
+%define %%XMM4 %12
+%define %%XMM5 %13
+%define %%XMM6 %14
+%define %%XMM7 %15
+%define %%XMM8 %16
+
+ ; Karatsuba Method
+ movdqa %%T6, %%XMM1
+ pshufd %%T2, %%XMM1, 01001110b
+ pxor %%T2, %%XMM1
+ movdqu %%T5, [%%GDATA + HashKey_8]
+ pclmulqdq %%T6, %%T5, 0x11 ; %%T6 = a1*b1
+
+ pclmulqdq %%XMM1, %%T5, 0x00 ; %%XMM1 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_8_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ movdqa %%T7, %%XMM1
+ movdqa %%XMM1, %%T2 ; result in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM2
+ pshufd %%T2, %%XMM2, 01001110b
+ pxor %%T2, %%XMM2
+ movdqu %%T5, [%%GDATA + HashKey_7]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM2, %%T5, 0x00 ; %%XMM2 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_7_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM2
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM3
+ pshufd %%T2, %%XMM3, 01001110b
+ pxor %%T2, %%XMM3
+ movdqu %%T5, [%%GDATA + HashKey_6]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+ pclmulqdq %%XMM3, %%T5, 0x00 ; %%XMM3 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_6_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM3
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM4
+ pshufd %%T2, %%XMM4, 01001110b
+ pxor %%T2, %%XMM4
+ movdqu %%T5, [%%GDATA + HashKey_5]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM4, %%T5, 0x00              ; %%XMM4 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_5_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM4
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM5
+ pshufd %%T2, %%XMM5, 01001110b
+ pxor %%T2, %%XMM5
+ movdqu %%T5, [%%GDATA + HashKey_4]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM5, %%T5, 0x00              ; %%XMM5 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_4_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM5
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM6
+ pshufd %%T2, %%XMM6, 01001110b
+ pxor %%T2, %%XMM6
+ movdqu %%T5, [%%GDATA + HashKey_3]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM6, %%T5, 0x00              ; %%XMM6 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_3_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM6
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM7
+ pshufd %%T2, %%XMM7, 01001110b
+ pxor %%T2, %%XMM7
+ movdqu %%T5, [%%GDATA + HashKey_2]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM7, %%T5, 0x00              ; %%XMM7 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_2_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM7
+ pxor %%XMM1, %%T2 ; results accumulated in %%T6, %%T7, %%XMM1
+
+
+ ; Karatsuba Method
+ movdqa %%T1, %%XMM8
+ pshufd %%T2, %%XMM8, 01001110b
+ pxor %%T2, %%XMM8
+ movdqu %%T5, [%%GDATA + HashKey]
+ pclmulqdq %%T1, %%T5, 0x11 ; %%T1 = a1*b1
+
+        pclmulqdq       %%XMM8, %%T5, 0x00              ; %%XMM8 = a0*b0
+ movdqu %%T4, [%%GDATA + HashKey_k]
+ pclmulqdq %%T2, %%T4, 0x00 ; %%T2 = (a1+a0)*(b1+b0)
+
+ pxor %%T6, %%T1
+ pxor %%T7, %%XMM8
+ pxor %%T2, %%XMM1
+ pxor %%T2, %%T6
+ pxor %%T2, %%T7 ; middle section of the temp results combined as in Karatsuba algorithm
+
+
+ movdqa %%T4, %%T2
+ pslldq %%T4, 8 ; shift-L %%T4 2 DWs
+ psrldq %%T2, 8 ; shift-R %%T2 2 DWs
+ pxor %%T7, %%T4
+ pxor %%T6, %%T2 ; <%%T6:%%T7> holds the result of the accumulated carry-less multiplications
+
+
+ ;first phase of the reduction
+ movdqa %%T2, %%T7
+ movdqa %%T3, %%T7
+ movdqa %%T4, %%T7 ; move %%T7 into %%T2, %%T3, %%T4 in order to perform the three shifts independently
+
+        pslld   %%T2, 31                                ; packed left shift << 31
+        pslld   %%T3, 30                                ; packed left shift << 30
+        pslld   %%T4, 25                                ; packed left shift << 25
+ pxor %%T2, %%T3 ; xor the shifted versions
+ pxor %%T2, %%T4
+
+ movdqa %%T1, %%T2
+ psrldq %%T1, 4 ; shift-R %%T1 1 DW
+
+ pslldq %%T2, 12 ; shift-L %%T2 3 DWs
+ pxor %%T7, %%T2 ; first phase of the reduction complete
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+ ;second phase of the reduction
+        movdqa  %%T2,%%T7                               ; make 3 copies of %%T7 (in %%T2, %%T3, %%T4) for doing three shift operations
+ movdqa %%T3,%%T7
+ movdqa %%T4,%%T7
+
+        psrld   %%T2,1                                  ; packed right shift >> 1
+        psrld   %%T3,2                                  ; packed right shift >> 2
+        psrld   %%T4,7                                  ; packed right shift >> 7
+ pxor %%T2,%%T3 ; xor the shifted versions
+ pxor %%T2,%%T4
+
+ pxor %%T2, %%T1
+ pxor %%T7, %%T2
+ pxor %%T6, %%T7 ; the result is in %%T6
+
+%endmacro
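+
+; Note on the two reduction phases above (an informal sketch): the 256-bit
+; carry-less product held in <%%T6:%%T7> (high:low) is reduced modulo the GHASH
+; polynomial x^128 + x^7 + x^2 + x + 1, kept here in the bit-reflected form
+; implied by the HashKey<<1 precomputation. The shift amounts mirror the x, x^2
+; and x^7 terms of the polynomial: the first phase folds the low half upward
+; with packed shifts by 31, 30 and 25 (32 minus 1, 2, 7, plus the byte shifts),
+; and the second phase folds the high half back down with shifts by 1, 2 and 7.
+; This is the shift-based reduction described in Intel's AES-GCM white paper.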
+
+; Encryption of a single block
+%macro ENCRYPT_SINGLE_BLOCK 3
+%define %%GDATA %1
+%define %%ST %2
+%define %%T1 %3
+ movdqu %%T1, [%%GDATA+16*0]
+ pxor %%ST, %%T1
+%assign i 1
+%rep 13
+ movdqu %%T1, [%%GDATA+16*i]
+ aesenc %%ST, %%T1
+%assign i (i+1)
+%endrep
+ movdqu %%T1, [%%GDATA+16*i]
+ aesenclast %%ST, %%T1
+%endmacro
+
+
+;; Start of Stack Setup
+
+%macro FUNC_SAVE 0
+        ;; Required for Update/GCM_ENC
+ ;the number of pushes must equal STACK_OFFSET
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r14, rsp
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+ movdqu [rsp + LOCAL_STORAGE + 1*16],xmm7
+ movdqu [rsp + LOCAL_STORAGE + 2*16],xmm8
+ movdqu [rsp + LOCAL_STORAGE + 3*16],xmm9
+ movdqu [rsp + LOCAL_STORAGE + 4*16],xmm10
+ movdqu [rsp + LOCAL_STORAGE + 5*16],xmm11
+ movdqu [rsp + LOCAL_STORAGE + 6*16],xmm12
+ movdqu [rsp + LOCAL_STORAGE + 7*16],xmm13
+ movdqu [rsp + LOCAL_STORAGE + 8*16],xmm14
+ movdqu [rsp + LOCAL_STORAGE + 9*16],xmm15
+%endif
+%endmacro
+
+
+%macro FUNC_RESTORE 0
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + LOCAL_STORAGE + 9*16]
+ movdqu xmm14 , [rsp + LOCAL_STORAGE + 8*16]
+ movdqu xmm13 , [rsp + LOCAL_STORAGE + 7*16]
+ movdqu xmm12 , [rsp + LOCAL_STORAGE + 6*16]
+ movdqu xmm11 , [rsp + LOCAL_STORAGE + 5*16]
+ movdqu xmm10 , [rsp + LOCAL_STORAGE + 4*16]
+ movdqu xmm9 , [rsp + LOCAL_STORAGE + 3*16]
+ movdqu xmm8 , [rsp + LOCAL_STORAGE + 2*16]
+ movdqu xmm7 , [rsp + LOCAL_STORAGE + 1*16]
+ movdqu xmm6 , [rsp + LOCAL_STORAGE + 0*16]
+%endif
+
+;; Required for Update/GCM_ENC
+ mov rsp, r14
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_INIT initializes a gcm_data struct to prepare for encoding/decoding.
+; Input: gcm_data struct* (GDATA), IV, Additional Authentication data (A_IN), Additional
+; Data length (A_LEN)
+; Output: Updated GDATA with the hash of A_IN (AadHash) and initialized other parts of GDATA.
+; Clobbers rax, r10-r13 and xmm0-xmm6
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_INIT 4
+%define %%GDATA %1
+%define %%IV %2
+%define %%A_IN %3
+%define %%A_LEN %4
+%define %%AAD_HASH xmm0
+%define %%SUBHASH xmm1
+
+
+ movdqu %%SUBHASH, [%%GDATA + HashKey]
+
+ CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%SUBHASH, xmm2, xmm3, xmm4, xmm5, xmm6, r10, r11, r12, r13, rax
+ pxor xmm2, xmm3
+ mov r10, %%A_LEN
+
+        movdqu  [%%GDATA + AadHash], %%AAD_HASH         ; my_ctx_data.aad_hash = aad_hash
+ mov [%%GDATA + AadLen], r10 ; my_ctx_data.aad_length = aad_length
+ xor r10, r10
+ mov [%%GDATA + InLen], r10 ; my_ctx_data.in_length = 0
+ mov [%%GDATA + PBlockLen], r10 ; my_ctx_data.partial_block_length = 0
+ movdqu [%%GDATA + PBlockEncKey], xmm2 ; my_ctx_data.partial_block_enc_key = 0
+ mov r10, %%IV
+ movdqu xmm2, [r10]
+ movdqu [%%GDATA + OrigIV], xmm2 ; my_ctx_data.orig_IV = iv
+
+ pshufb xmm2, [SHUF_MASK]
+
+ movdqu [%%GDATA + CurCount], xmm2 ; my_ctx_data.current_counter = iv
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_data struct has been
+; initialized by GCM_INIT
+; Requires the input data be at least 1 byte long because of READ_SMALL_DATA_INPUT.
+; Input: gcm_data struct* (GDATA), input text (PLAIN_CYPH_IN), input text length (PLAIN_CYPH_LEN),
+; and whether encoding or decoding (ENC_DEC)
+; Output: A cypher of the given plain text (CYPH_PLAIN_OUT), and updated GDATA
+; Clobbers rax, r10-r15, and xmm0-xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_ENC_DEC 5
+%define %%GDATA %1
+%define %%CYPH_PLAIN_OUT %2
+%define %%PLAIN_CYPH_IN %3
+%define %%PLAIN_CYPH_LEN %4
+%define %%ENC_DEC %5
+%define %%DATA_OFFSET r11
+
+; Macro flow:
+; calculate the number of 16-byte blocks in the message
+; process (number of 16-byte blocks) mod 8 '%%_initial_num_blocks_is_# .. %%_initial_blocks_encrypted'
+; process 8 16-byte blocks at a time until all are done '%%_encrypt_by_8_new .. %%_eight_cipher_left'
+; if there is a block of less than 16 bytes process it '%%_zero_cipher_left .. %%_multiple_of_16_bytes'
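+;
+; Roughly, in C-like pseudocode (illustrative only; these names are not in the code):
+;     bytes = plain_cyph_len - data_offset;   /* left after PARTIAL_BLOCK       */
+;     full  = bytes & ~15;                    /* bytes in whole 16-byte blocks  */
+;     first = (full >> 4) & 7;                /* 0..7 blocks for INITIAL_BLOCKS */
+; the remaining full blocks are then consumed 128 bytes per pass by
+; GHASH_8_ENCRYPT_8_PARALLEL, and any tail of (bytes & 15) is handled at
+; %%_zero_cipher_left.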
+
+ cmp %%PLAIN_CYPH_LEN, 0
+ je %%_multiple_of_16_bytes
+
+ xor %%DATA_OFFSET, %%DATA_OFFSET
+ add [%%GDATA+InLen], %%PLAIN_CYPH_LEN ;Update length of data processed
+ movdqu xmm13, [%%GDATA + HashKey] ; xmm13 = HashKey
+ movdqu xmm8, [%%GDATA + AadHash]
+
+
+ PARTIAL_BLOCK %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%PLAIN_CYPH_LEN, %%DATA_OFFSET, xmm8, %%ENC_DEC
+
+ mov r13, %%PLAIN_CYPH_LEN ; save the number of bytes of plaintext/ciphertext
+ sub r13, %%DATA_OFFSET
+ mov r10, r13 ;save the amount of data left to process in r10
+ and r13, -16 ; r13 = r13 - (r13 mod 16)
+
+ mov r12, r13
+ shr r12, 4
+ and r12, 7
+ jz %%_initial_num_blocks_is_0
+
+ cmp r12, 7
+ je %%_initial_num_blocks_is_7
+ cmp r12, 6
+ je %%_initial_num_blocks_is_6
+ cmp r12, 5
+ je %%_initial_num_blocks_is_5
+ cmp r12, 4
+ je %%_initial_num_blocks_is_4
+ cmp r12, 3
+ je %%_initial_num_blocks_is_3
+ cmp r12, 2
+ je %%_initial_num_blocks_is_2
+
+ jmp %%_initial_num_blocks_is_1
+
+%%_initial_num_blocks_is_7:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 7, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*7
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_6:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 6, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*6
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_5:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 5, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*5
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_4:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 4, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*4
+ jmp %%_initial_blocks_encrypted
+
+
+%%_initial_num_blocks_is_3:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 3, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*3
+ jmp %%_initial_blocks_encrypted
+%%_initial_num_blocks_is_2:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 2, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16*2
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_1:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 1, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+ sub r13, 16
+ jmp %%_initial_blocks_encrypted
+
+%%_initial_num_blocks_is_0:
+ INITIAL_BLOCKS %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, r13, %%DATA_OFFSET, 0, xmm12, xmm13, xmm14, xmm15, xmm11, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm10, xmm0, %%ENC_DEC
+
+
+%%_initial_blocks_encrypted:
+ cmp r13, 0
+ je %%_zero_cipher_left
+
+ sub r13, 128
+ je %%_eight_cipher_left
+
+
+
+
+ movd r15d, xmm9
+ and r15d, 255
+ pshufb xmm9, [SHUF_MASK]
+
+
+%%_encrypt_by_8_new:
+ cmp r15d, 255-8
+ jg %%_encrypt_by_8
+
+
+
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, out_order, %%ENC_DEC
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ pshufb xmm9, [SHUF_MASK]
+ jmp %%_eight_cipher_left
+
+%%_encrypt_by_8:
+ pshufb xmm9, [SHUF_MASK]
+ add r15b, 8
+ GHASH_8_ENCRYPT_8_PARALLEL %%GDATA, %%CYPH_PLAIN_OUT, %%PLAIN_CYPH_IN, %%DATA_OFFSET, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm9, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm15, in_order, %%ENC_DEC
+ pshufb xmm9, [SHUF_MASK]
+ add %%DATA_OFFSET, 128
+ sub r13, 128
+ jne %%_encrypt_by_8_new
+
+ pshufb xmm9, [SHUF_MASK]
+
+
+
+
+%%_eight_cipher_left:
+ GHASH_LAST_8 %%GDATA, xmm0, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8
+
+
+%%_zero_cipher_left:
+ movdqu [%%GDATA + AadHash], xmm14
+ movdqu [%%GDATA + CurCount], xmm9
+
+ mov r13, r10
+ and r13, 15 ; r13 = (%%PLAIN_CYPH_LEN mod 16)
+
+ je %%_multiple_of_16_bytes
+
+        mov     [%%GDATA + PBlockLen], r13              ; my_ctx_data.partial_block_length = r13
+        ; handle the last <16 Byte block separately
+
+ paddd xmm9, [ONE] ; INCR CNT to get Yn
+        movdqu  [%%GDATA + CurCount], xmm9              ; my_ctx_data.current_counter = xmm9
+ pshufb xmm9, [SHUF_MASK]
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9, xmm2 ; E(K, Yn)
+ movdqu [%%GDATA + PBlockEncKey], xmm9 ; my_ctx_data.partial_block_enc_key = xmm9
+
+ cmp %%PLAIN_CYPH_LEN, 16
+ jge %%_large_enough_update
+
+ lea r10, [%%PLAIN_CYPH_IN + %%DATA_OFFSET]
+ READ_SMALL_DATA_INPUT xmm1, r10, r13, r12, r15, rax
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13
+ jmp %%_data_read
+
+%%_large_enough_update:
+ sub %%DATA_OFFSET, 16
+ add %%DATA_OFFSET, r13
+
+ movdqu xmm1, [%%PLAIN_CYPH_IN+%%DATA_OFFSET] ; receive the last <16 Byte block
+
+ sub %%DATA_OFFSET, r13
+ add %%DATA_OFFSET, 16
+
+ lea r12, [SHIFT_MASK + 16]
+ sub r12, r13 ; adjust the shuffle mask pointer to be able to shift 16-r13 bytes (r13 is the number of bytes in plaintext mod 16)
+ movdqu xmm2, [r12] ; get the appropriate shuffle mask
+ pshufb xmm1, xmm2 ; shift right 16-r13 bytes
+%%_data_read:
+ %ifidn %%ENC_DEC, DEC
+ movdqa xmm2, xmm1
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ pand xmm2, xmm1
+ pshufb xmm2, [SHUF_MASK]
+ pxor xmm14, xmm2
+ movdqu [%%GDATA + AadHash], xmm14
+
+ %else
+ pxor xmm9, xmm1 ; Plaintext XOR E(K, Yn)
+ movdqu xmm1, [r12 + ALL_F - SHIFT_MASK] ; get the appropriate mask to mask out top 16-r13 bytes of xmm9
+ pand xmm9, xmm1 ; mask out top 16-r13 bytes of xmm9
+ pshufb xmm9, [SHUF_MASK]
+ pxor xmm14, xmm9
+ movdqu [%%GDATA + AadHash], xmm14
+
+ pshufb xmm9, [SHUF_MASK] ; shuffle xmm9 back to output as ciphertext
+ %endif
+
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ ; output r13 Bytes
+ movq rax, xmm9
+ cmp r13, 8
+ jle %%_less_than_8_bytes_left
+
+ mov [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], rax
+ add %%DATA_OFFSET, 8
+ psrldq xmm9, 8
+ movq rax, xmm9
+ sub r13, 8
+
+%%_less_than_8_bytes_left:
+ mov BYTE [%%CYPH_PLAIN_OUT + %%DATA_OFFSET], al
+ add %%DATA_OFFSET, 1
+ shr rax, 8
+ sub r13, 1
+ jne %%_less_than_8_bytes_left
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%%_multiple_of_16_bytes:
+
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; GCM_COMPLETE Finishes Encryption/Decryption of last partial block after GCM_UPDATE finishes.
+; Input: A gcm_data struct* (GDATA) and whether encoding or decoding (ENC_DEC).
+; Output: Authentication Tag (AUTH_TAG) and Authentication Tag length (AUTH_TAG_LEN)
+; Clobbers rax, r10-r12, and xmm0, xmm1, xmm5, xmm6, xmm9, xmm11, xmm14, xmm15
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro GCM_COMPLETE 4
+%define %%GDATA %1
+%define %%AUTH_TAG %2
+%define %%AUTH_TAG_LEN %3
+%define %%ENC_DEC %4
+%define %%PLAIN_CYPH_LEN rax
+
+        mov     r12, [%%GDATA + PBlockLen]              ; r12 = length of the partial block (in bytes)
+ movdqu xmm14, [%%GDATA + AadHash]
+ movdqu xmm13, [%%GDATA + HashKey]
+
+ cmp r12, 0
+
+ je %%_partial_done
+
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
+ movdqu [%%GDATA+AadHash], xmm14
+
+%%_partial_done:
+
+ mov r12, [%%GDATA + AadLen] ; r12 = aadLen (number of bytes)
+ mov %%PLAIN_CYPH_LEN, [%%GDATA + InLen]
+
+ shl r12, 3 ; convert into number of bits
+ movd xmm15, r12d ; len(A) in xmm15
+
+        shl     %%PLAIN_CYPH_LEN, 3                     ; len(C) in bits (*8)
+ movq xmm1, %%PLAIN_CYPH_LEN
+ pslldq xmm15, 8 ; xmm15 = len(A)|| 0x0000000000000000
+ pxor xmm15, xmm1 ; xmm15 = len(A)||len(C)
+
+ pxor xmm14, xmm15
+ GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ; final GHASH computation
+ pshufb xmm14, [SHUF_MASK] ; perform a 16Byte swap
+
+ movdqu xmm9, [%%GDATA + OrigIV] ; xmm9 = Y0
+
+ ENCRYPT_SINGLE_BLOCK %%GDATA, xmm9, xmm2 ; E(K, Y0)
+
+ pxor xmm9, xmm14
+
+
+
+%%_return_T:
+ mov r10, %%AUTH_TAG ; r10 = authTag
+ mov r11, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+
+ cmp r11, 16
+ je %%_T_16
+
+ cmp r11, 12
+ je %%_T_12
+
+%%_T_8:
+ movq rax, xmm9
+ mov [r10], rax
+ jmp %%_return_T_done
+%%_T_12:
+ movq rax, xmm9
+ mov [r10], rax
+ psrldq xmm9, 8
+ movd eax, xmm9
+ mov [r10 + 8], eax
+ jmp %%_return_T_done
+
+%%_T_16:
+ movdqu [r10], xmm9
+
+%%_return_T_done:
+%endmacro ;GCM_COMPLETE
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_precomp_sse
+; (gcm_data *my_ctx_data);
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_precomp_sse
+aesni_gcm256_precomp_sse:
+ push r12
+ push r13
+ push r14
+ push r15
+
+ mov r14, rsp
+
+
+
+ sub rsp, VARIABLE_OFFSET
+ and rsp, ~63 ; align rsp to 64 bytes
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; only xmm6 needs to be maintained
+ movdqu [rsp + LOCAL_STORAGE + 0*16],xmm6
+%endif
+
+ pxor xmm6, xmm6
+ ENCRYPT_SINGLE_BLOCK arg1, xmm6, xmm2 ; xmm6 = HashKey
+
+ pshufb xmm6, [SHUF_MASK]
+ ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;;
+ movdqa xmm2, xmm6
+ psllq xmm6, 1
+ psrlq xmm2, 63
+ movdqa xmm1, xmm2
+ pslldq xmm2, 8
+ psrldq xmm1, 8
+ por xmm6, xmm2
+ ;reduction
+ pshufd xmm2, xmm1, 00100100b
+ pcmpeqd xmm2, [TWOONE]
+ pand xmm2, [POLY]
+ pxor xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+ movdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly
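+        ; In other words (informal sketch): the code above shifts H left by one
+        ; bit across the full 128-bit value; if the bit shifted out of the top
+        ; was 1, the TWOONE compare selects the POLY constant and it is XORed
+        ; back in. That single-step reduction puts the hash key into the
+        ; bit-reflected form the later PCLMULQDQ-based GHASH multiplications expect.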
+
+
+ PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6, [rsp + LOCAL_STORAGE + 0*16]
+%endif
+ mov rsp, r14
+
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_init_sse(
+; gcm_data *my_ctx_data,
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len); /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_init_sse
+aesni_gcm256_init_sse:
+ push r12
+ push r13
+%ifidn __OUTPUT_FORMAT__, win64
+        ; only xmm6 needs to be maintained for Windows (GCM_INIT clobbers xmm0-xmm6)
+ sub rsp, 1*16
+ movdqu [rsp + 0*16],xmm6
+%endif
+
+ GCM_INIT arg1, arg2, arg3, arg4
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 1*16
+%endif
+ pop r13
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_update_sse(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+;       u64     plaintext_len); /* Length of data in Bytes for encryption. Must be a multiple of 16 bytes. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_update_sse
+aesni_gcm256_enc_update_sse:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_update_sse(
+; gcm_data *my_ctx_data,
+;       u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
+;       const   u8 *in, /* Cyphertext input */
+;       u64     plaintext_len); /* Length of data in Bytes for decryption. Must be a multiple of 16 bytes. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_update_sse
+aesni_gcm256_dec_update_sse:
+
+ FUNC_SAVE
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ FUNC_RESTORE
+
+ ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_finalize_sse(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_finalize_sse
+aesni_gcm256_enc_finalize_sse:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ movdqu [rsp + 0*16],xmm6
+ movdqu [rsp + 1*16],xmm9
+ movdqu [rsp + 2*16],xmm11
+ movdqu [rsp + 3*16],xmm14
+ movdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, ENC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + 4*16]
+ movdqu xmm14 , [rsp+ 3*16]
+ movdqu xmm11 , [rsp + 2*16]
+ movdqu xmm9 , [rsp + 1*16]
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
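+
+; Illustrative call sequence for the streaming interface (a sketch; buffer
+; management and the key-expansion step that fills gdata.expanded_keys are
+; omitted):
+;
+;     aesni_gcm256_precomp_sse(&gdata);                        /* once per key      */
+;     aesni_gcm256_init_sse(&gdata, iv, aad, aad_len);
+;     aesni_gcm256_enc_update_sse(&gdata, out, in, len);       /* repeat as needed  */
+;     aesni_gcm256_enc_finalize_sse(&gdata, auth_tag, auth_tag_len);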
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_finalize_sse(
+; gcm_data *my_ctx_data,
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_finalize_sse
+aesni_gcm256_dec_finalize_sse:
+
+ push r12
+
+%ifidn __OUTPUT_FORMAT__, win64
+ ; xmm6:xmm15 need to be maintained for Windows
+ sub rsp, 5*16
+ movdqu [rsp + 0*16],xmm6
+ movdqu [rsp + 1*16],xmm9
+ movdqu [rsp + 2*16],xmm11
+ movdqu [rsp + 3*16],xmm14
+ movdqu [rsp + 4*16],xmm15
+%endif
+ GCM_COMPLETE arg1, arg2, arg3, DEC
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqu xmm15 , [rsp + 4*16]
+ movdqu xmm14 , [rsp+ 3*16]
+ movdqu xmm11 , [rsp + 2*16]
+ movdqu xmm9 , [rsp + 1*16]
+ movdqu xmm6 , [rsp + 0*16]
+ add rsp, 5*16
+%endif
+
+ pop r12
+ret
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_enc_sse(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
+; const u8 *in, /* Plaintext input */
+; u64 plaintext_len, /* Length of data in Bytes for encryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_enc_sse
+aesni_gcm256_enc_sse:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, ENC
+
+ GCM_COMPLETE arg1, arg8, arg9, ENC
+
+ FUNC_RESTORE
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;void aesni_gcm256_dec_sse(
+; gcm_data *my_ctx_data,
+; u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
+; const u8 *in, /* Ciphertext input */
+;       u64     plaintext_len, /* Length of data in Bytes for decryption. */
+; u8 *iv, /* Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer. */
+; const u8 *aad, /* Additional Authentication Data (AAD)*/
+; u64 aad_len, /* Length of AAD in bytes (must be a multiple of 4 bytes). */
+; u8 *auth_tag, /* Authenticated Tag output. */
+; u64 auth_tag_len); /* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+global aesni_gcm256_dec_sse
+aesni_gcm256_dec_sse:
+
+ FUNC_SAVE
+
+ GCM_INIT arg1, arg5, arg6, arg7
+
+ GCM_ENC_DEC arg1, arg2, arg3, arg4, DEC
+
+ GCM_COMPLETE arg1, arg8, arg9, DEC
+
+ FUNC_RESTORE
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm
new file mode 100644
index 000000000..d7df9712f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_defines.asm
@@ -0,0 +1,163 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;
+; Authors:
+; Erdinc Ozturk
+; Vinodh Gopal
+; James Guilford
+
+
+;;;;;;
+; Remove the need for different yasm commandlines on Linux vs Windows
+%ifidn __OUTPUT_FORMAT__, elf64
+%define LINUX
+%else
+%define WIN_ABI
+%endif
+
+
+section .data
+
+align 16
+
+POLY dq 0x0000000000000001, 0xC200000000000000
+POLY2 dq 0x00000001C2000000, 0xC200000000000000
+TWOONE dq 0x0000000000000001, 0x0000000100000000
+
+; order of these constants should not change.
+; more specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
+
+SHUF_MASK dq 0x08090A0B0C0D0E0F, 0x0001020304050607
+SHIFT_MASK dq 0x0706050403020100, 0x0f0e0d0c0b0a0908
+ALL_F dq 0xffffffffffffffff, 0xffffffffffffffff
+ZERO dq 0x0000000000000000, 0x0000000000000000
+ONE dq 0x0000000000000001, 0x0000000000000000
+ONEf dq 0x0000000000000000, 0x0100000000000000
+
+section .text
+
+
+;;define the fields of gcm_data struct
+;typedef struct gcm_data
+;{
+; u8 expanded_keys[16*15];
+; u8 shifted_hkey_1[16]; // store HashKey <<1 mod poly here
+; u8 shifted_hkey_2[16]; // store HashKey^2 <<1 mod poly here
+; u8 shifted_hkey_3[16]; // store HashKey^3 <<1 mod poly here
+; u8 shifted_hkey_4[16]; // store HashKey^4 <<1 mod poly here
+; u8 shifted_hkey_5[16]; // store HashKey^5 <<1 mod poly here
+; u8 shifted_hkey_6[16]; // store HashKey^6 <<1 mod poly here
+; u8 shifted_hkey_7[16]; // store HashKey^7 <<1 mod poly here
+; u8 shifted_hkey_8[16]; // store HashKey^8 <<1 mod poly here
+; u8 shifted_hkey_1_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_2_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_3_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_4_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_5_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_6_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_7_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+; u8 shifted_hkey_8_k[16]; // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+;} gcm_data;
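+;
+; (The %define values below are byte offsets into this layout; the remaining
+; per-stream context fields -- AadHash, AadLen, InLen, PBlockEncKey, OrigIV,
+; CurCount, PBlockLen -- follow the hash keys in the full gcm_data definition
+; in aes_gcm.h.)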
+
+%define HashKey 16*15 ; store HashKey <<1 mod poly here
+%define HashKey_2 16*16 ; store HashKey^2 <<1 mod poly here
+%define HashKey_3 16*17 ; store HashKey^3 <<1 mod poly here
+%define HashKey_4 16*18 ; store HashKey^4 <<1 mod poly here
+%define HashKey_5 16*19 ; store HashKey^5 <<1 mod poly here
+%define HashKey_6 16*20 ; store HashKey^6 <<1 mod poly here
+%define HashKey_7 16*21 ; store HashKey^7 <<1 mod poly here
+%define HashKey_8 16*22 ; store HashKey^8 <<1 mod poly here
+%define HashKey_k 16*23 ; store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_2_k 16*24 ; store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_3_k 16*25 ; store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_4_k 16*26 ; store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_5_k 16*27 ; store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_6_k 16*28 ; store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_7_k 16*29 ; store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+%define HashKey_8_k 16*30 ; store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+%define AadHash 16*31 ; store current Hash of data which has been input
+%define AadLen 16*32 ; store length of input data which will not be encrypted or decrypted
+%define InLen 16*32+8 ; store length of input data which will be encrypted or decrypted
+%define PBlockEncKey 16*33 ; encryption key for the partial block at the end of the previous update
+%define OrigIV 16*34 ; input IV
+%define CurCount 16*35 ; Current counter for generation of encryption key
+%define PBlockLen 16*36 ; length of partial block at the end of the previous update
+
+%define reg(q) xmm %+ q
+
+
+
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ %xdefine arg1 rcx
+ %xdefine arg2 rdx
+ %xdefine arg3 r8
+ %xdefine arg4 r9
+ %xdefine arg5 [r14 + STACK_OFFSET + 8*5]
+ %xdefine arg6 [r14 + STACK_OFFSET + 8*6]
+ %xdefine arg7 [r14 + STACK_OFFSET + 8*7]
+ %xdefine arg8 [r14 + STACK_OFFSET + 8*8]
+ %xdefine arg9 [r14 + STACK_OFFSET + 8*9]
+
+%else
+ %xdefine arg1 rdi
+ %xdefine arg2 rsi
+ %xdefine arg3 rdx
+ %xdefine arg4 rcx
+ %xdefine arg5 r8
+ %xdefine arg6 r9
+ %xdefine arg7 [r14 + STACK_OFFSET + 8*1]
+ %xdefine arg8 [r14 + STACK_OFFSET + 8*2]
+ %xdefine arg9 [r14 + STACK_OFFSET + 8*3]
+
+%endif
+
+%ifdef NT_LDST
+ %define NT_LD
+ %define NT_ST
+%endif
+
+;;; Use Non-temporal load/store
+%ifdef NT_LD
+ %define XLDR movntdqa
+ %define VXLDR vmovntdqa
+%else
+ %define XLDR movdqu
+ %define VXLDR vmovdqu
+%endif
+
+;;; Use Non-temporal load/store
+%ifdef NT_ST
+ %define XSTR movntdq
+ %define VXSTR vmovntdq
+%else
+ %define XSTR movdqu
+ %define VXSTR vmovdqu
+%endif
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm
new file mode 100644
index 000000000..e46066ac5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_multibinary.asm
@@ -0,0 +1,172 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+
+extern aesni_gcm128_init_sse
+extern aesni_gcm128_init_avx_gen4
+extern aesni_gcm128_init_avx_gen2
+
+extern aesni_gcm128_enc_sse
+extern aesni_gcm128_enc_avx_gen4
+extern aesni_gcm128_enc_avx_gen2
+extern aesni_gcm128_enc_update_sse
+extern aesni_gcm128_enc_update_avx_gen4
+extern aesni_gcm128_enc_update_avx_gen2
+extern aesni_gcm128_enc_finalize_sse
+extern aesni_gcm128_enc_finalize_avx_gen4
+extern aesni_gcm128_enc_finalize_avx_gen2
+
+extern aesni_gcm128_dec_sse
+extern aesni_gcm128_dec_avx_gen4
+extern aesni_gcm128_dec_avx_gen2
+extern aesni_gcm128_dec_update_sse
+extern aesni_gcm128_dec_update_avx_gen4
+extern aesni_gcm128_dec_update_avx_gen2
+extern aesni_gcm128_dec_finalize_sse
+extern aesni_gcm128_dec_finalize_avx_gen4
+extern aesni_gcm128_dec_finalize_avx_gen2
+
+extern aesni_gcm128_precomp_sse
+extern aesni_gcm128_precomp_avx_gen4
+extern aesni_gcm128_precomp_avx_gen2
+
+
+
+extern aesni_gcm256_init_sse
+extern aesni_gcm256_init_avx_gen4
+extern aesni_gcm256_init_avx_gen2
+
+extern aesni_gcm256_enc_sse
+extern aesni_gcm256_enc_avx_gen4
+extern aesni_gcm256_enc_avx_gen2
+extern aesni_gcm256_enc_update_sse
+extern aesni_gcm256_enc_update_avx_gen4
+extern aesni_gcm256_enc_update_avx_gen2
+extern aesni_gcm256_enc_finalize_sse
+extern aesni_gcm256_enc_finalize_avx_gen4
+extern aesni_gcm256_enc_finalize_avx_gen2
+
+extern aesni_gcm256_dec_sse
+extern aesni_gcm256_dec_avx_gen4
+extern aesni_gcm256_dec_avx_gen2
+extern aesni_gcm256_dec_update_sse
+extern aesni_gcm256_dec_update_avx_gen4
+extern aesni_gcm256_dec_update_avx_gen2
+extern aesni_gcm256_dec_finalize_sse
+extern aesni_gcm256_dec_finalize_avx_gen4
+extern aesni_gcm256_dec_finalize_avx_gen2
+
+extern aesni_gcm256_precomp_sse
+extern aesni_gcm256_precomp_avx_gen4
+extern aesni_gcm256_precomp_avx_gen2
+
+section .text
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate aesni_gcm interfaces init, enc, enc_update, enc_finalize, dec, dec_update, dec_finalize and precomp
+;;;;
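+; Each mbin_interface below creates the public, un-suffixed symbol (e.g.
+; aesni_gcm256_enc); mbin_dispatch_init then points it at the _sse, _avx_gen2
+; or _avx_gen4 implementation according to the CPU features detected at
+; runtime (see multibinary.asm for the dispatch logic).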
+mbin_interface aesni_gcm128_init
+mbin_dispatch_init aesni_gcm128_init, aesni_gcm128_init_sse, aesni_gcm128_init_avx_gen2, aesni_gcm128_init_avx_gen4
+
+mbin_interface aesni_gcm128_enc
+mbin_dispatch_init aesni_gcm128_enc, aesni_gcm128_enc_sse, aesni_gcm128_enc_avx_gen2, aesni_gcm128_enc_avx_gen4
+
+mbin_interface aesni_gcm128_enc_update
+mbin_dispatch_init aesni_gcm128_enc_update, aesni_gcm128_enc_update_sse, aesni_gcm128_enc_update_avx_gen2, aesni_gcm128_enc_update_avx_gen4
+
+mbin_interface aesni_gcm128_enc_finalize
+mbin_dispatch_init aesni_gcm128_enc_finalize, aesni_gcm128_enc_finalize_sse, aesni_gcm128_enc_finalize_avx_gen2, aesni_gcm128_enc_finalize_avx_gen4
+
+mbin_interface aesni_gcm128_dec
+mbin_dispatch_init aesni_gcm128_dec, aesni_gcm128_dec_sse, aesni_gcm128_dec_avx_gen2, aesni_gcm128_dec_avx_gen4
+
+mbin_interface aesni_gcm128_dec_update
+mbin_dispatch_init aesni_gcm128_dec_update, aesni_gcm128_dec_update_sse, aesni_gcm128_dec_update_avx_gen2, aesni_gcm128_dec_update_avx_gen4
+
+mbin_interface aesni_gcm128_dec_finalize
+mbin_dispatch_init aesni_gcm128_dec_finalize, aesni_gcm128_dec_finalize_sse, aesni_gcm128_dec_finalize_avx_gen2, aesni_gcm128_dec_finalize_avx_gen4
+
+mbin_interface aesni_gcm128_precomp
+mbin_dispatch_init aesni_gcm128_precomp, aesni_gcm128_precomp_sse, aesni_gcm128_precomp_avx_gen2, aesni_gcm128_precomp_avx_gen4
+
+;;;;
+; instantiate aesni_gcm interfaces init, enc, enc_update, enc_finalize, dec, dec_update, dec_finalize and precomp
+;;;;
+mbin_interface aesni_gcm256_init
+mbin_dispatch_init aesni_gcm256_init, aesni_gcm256_init_sse, aesni_gcm256_init_avx_gen2, aesni_gcm256_init_avx_gen4
+
+mbin_interface aesni_gcm256_enc
+mbin_dispatch_init aesni_gcm256_enc, aesni_gcm256_enc_sse, aesni_gcm256_enc_avx_gen2, aesni_gcm256_enc_avx_gen4
+
+mbin_interface aesni_gcm256_enc_update
+mbin_dispatch_init aesni_gcm256_enc_update, aesni_gcm256_enc_update_sse, aesni_gcm256_enc_update_avx_gen2, aesni_gcm256_enc_update_avx_gen4
+
+mbin_interface aesni_gcm256_enc_finalize
+mbin_dispatch_init aesni_gcm256_enc_finalize, aesni_gcm256_enc_finalize_sse, aesni_gcm256_enc_finalize_avx_gen2, aesni_gcm256_enc_finalize_avx_gen4
+
+mbin_interface aesni_gcm256_dec
+mbin_dispatch_init aesni_gcm256_dec, aesni_gcm256_dec_sse, aesni_gcm256_dec_avx_gen2, aesni_gcm256_dec_avx_gen4
+
+mbin_interface aesni_gcm256_dec_update
+mbin_dispatch_init aesni_gcm256_dec_update, aesni_gcm256_dec_update_sse, aesni_gcm256_dec_update_avx_gen2, aesni_gcm256_dec_update_avx_gen4
+
+mbin_interface aesni_gcm256_dec_finalize
+mbin_dispatch_init aesni_gcm256_dec_finalize, aesni_gcm256_dec_finalize_sse, aesni_gcm256_dec_finalize_avx_gen2, aesni_gcm256_dec_finalize_avx_gen4
+
+mbin_interface aesni_gcm256_precomp
+mbin_dispatch_init aesni_gcm256_precomp, aesni_gcm256_precomp_sse, aesni_gcm256_precomp_avx_gen2, aesni_gcm256_precomp_avx_gen4
+
+
+;;; func core, ver, snum
+slversion aesni_gcm128_enc, 00, 00, 0280
+slversion aesni_gcm128_dec, 00, 00, 0281
+slversion aesni_gcm128_init, 00, 00, 0282
+slversion aesni_gcm128_enc_update, 00, 00, 0283
+slversion aesni_gcm128_dec_update, 00, 00, 0284
+slversion aesni_gcm128_enc_finalize, 00, 00, 0285
+slversion aesni_gcm128_dec_finalize, 00, 00, 0286
+slversion aesni_gcm256_enc, 00, 00, 0288
+slversion aesni_gcm256_dec, 00, 00, 0289
+slversion aesni_gcm256_init, 00, 00, 028a
+slversion aesni_gcm256_enc_update, 00, 00, 028b
+slversion aesni_gcm256_dec_update, 00, 00, 028c
+slversion aesni_gcm256_enc_finalize, 00, 00, 028d
+slversion aesni_gcm256_dec_finalize, 00, 00, 028e
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c
new file mode 100644
index 000000000..a8e9f39ee
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_ossl_perf.c
@@ -0,0 +1,272 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include <test.h>
+#include "ossl_helper.h"
+#include "gcm_vectors.h"
+
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define AAD_LENGTH 16
+#define TEST_MEM TEST_LEN
+
+static unsigned char *plaintext, *gcm_plaintext, *cyphertext, *ossl_plaintext,
+ *ossl_cyphertext, *gcm_tag, *ossl_tag, *IV, *AAD;
+static uint8_t key128[GCM_128_KEY_LEN];
+static uint8_t key256[GCM_256_KEY_LEN];
+uint8_t iv_len = 0;
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ unsigned int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, int vect, char *data_name)
+{
+ int mismatch;
+ int OK = 1;
+
+ mismatch = memcmp(test, expected, len);
+ if (mismatch) {
+ OK = 0;
+ printf(" v[%d] expected results don't match %s \t\t", vect, data_name);
+ {
+ uint64_t a;
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ }
+ return OK;
+}
+
+void aes_gcm_perf(void)
+{
+ struct gcm_data gdata;
+ struct gcm_data gdata256;
+ int i;
+
+ printf
+	    ("AES GCM performance parameters plain text length:%d; IV length:%d; AAD length:%d \n",
+ TEST_LEN, GCM_IV_LEN, AAD_LENGTH);
+
+ mk_rand_data(key128, sizeof(key128));
+ mk_rand_data(key256, sizeof(key256));
+
+ // This is only required once for a given key
+ aesni_gcm128_pre(key128, &gdata);
+ aesni_gcm256_pre(key256, &gdata256);
+
+ // Preload code cache
+ aesni_gcm128_enc(&gdata, cyphertext, plaintext, TEST_LEN, IV, AAD, AAD_LENGTH,
+ gcm_tag, MAX_TAG_LEN);
+ openssl_aes_gcm_enc(key128, IV, iv_len, AAD, AAD_LENGTH, ossl_tag, MAX_TAG_LEN,
+ plaintext, TEST_LEN, ossl_cyphertext);
+ check_data(cyphertext, ossl_cyphertext, TEST_LEN, 0,
+ "ISA-L vs OpenSSL 128 key cypher text (C)");
+ check_data(gcm_tag, ossl_tag, MAX_TAG_LEN, 0, "ISA-L vs OpenSSL 128 tag (T)");
+ aesni_gcm256_enc(&gdata256, cyphertext, plaintext, TEST_LEN, IV, AAD, AAD_LENGTH,
+ gcm_tag, MAX_TAG_LEN);
+ openssl_aes_256_gcm_enc(key256, IV, iv_len, AAD, AAD_LENGTH, ossl_tag, MAX_TAG_LEN,
+ plaintext, TEST_LEN, ossl_cyphertext);
+ check_data(cyphertext, ossl_cyphertext, TEST_LEN, 0,
+ "ISA-L vs OpenSSL 256 cypher text (C)");
+ check_data(gcm_tag, ossl_tag, MAX_TAG_LEN, 0, "ISA-L vs OpenSSL 256 tag (T)");
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aesni_gcm128_enc(&gdata, cyphertext, plaintext, TEST_LEN, IV, AAD,
+ AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+ }
+
+ perf_stop(&stop);
+ printf(" aes_gcm_enc" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_gcm_enc(key128, IV, iv_len, AAD, AAD_LENGTH,
+ ossl_tag, MAX_TAG_LEN, plaintext, TEST_LEN,
+ cyphertext);
+ }
+
+ perf_stop(&stop);
+ printf("openssl_aes_gcm_enc" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aesni_gcm128_dec(&gdata, plaintext, cyphertext, TEST_LEN, IV,
+ AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+ check_data(gcm_tag, gcm_tag, MAX_TAG_LEN, 0, "ISA-L check of tag (T)");
+ }
+
+ perf_stop(&stop);
+ printf(" aes_gcm_dec" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_gcm_dec(key128, IV, iv_len, AAD, AAD_LENGTH,
+ ossl_tag, MAX_TAG_LEN, cyphertext, TEST_LEN,
+ plaintext);
+ }
+
+ perf_stop(&stop);
+		printf("openssl_aes_gcm_dec" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ printf("\n");
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aesni_gcm256_enc(&gdata256, cyphertext, plaintext, TEST_LEN, IV, AAD,
+ AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+ }
+
+ perf_stop(&stop);
+ printf(" aes_gcm256_enc" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_256_gcm_enc(key256, IV, iv_len, AAD, AAD_LENGTH,
+ ossl_tag, MAX_TAG_LEN, plaintext, TEST_LEN,
+ cyphertext);
+ }
+
+ perf_stop(&stop);
+ printf("openssl_aes_256_gcm_enc" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ aesni_gcm256_dec(&gdata256, plaintext, cyphertext, TEST_LEN, IV,
+ AAD, AAD_LENGTH, gcm_tag, MAX_TAG_LEN);
+ check_data(gcm_tag, gcm_tag, MAX_TAG_LEN, 0,
+ "ISA-L check of 256 tag (T)");
+ }
+
+ perf_stop(&stop);
+ printf(" aes_gcm256_dec" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+ {
+ struct perf start, stop;
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ openssl_aes_256_gcm_dec(key256, IV, iv_len, AAD, AAD_LENGTH,
+ ossl_tag, MAX_TAG_LEN, cyphertext, TEST_LEN,
+ plaintext);
+ }
+
+ perf_stop(&stop);
+		printf("openssl_aes_256_gcm_dec" TEST_TYPE_STR ":\t");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+ }
+}
+
+int main(void)
+{
+ uint8_t const IVend[] = GCM_IV_END_MARK;
+ uint32_t OK = 1;
+
+ plaintext = malloc(TEST_LEN);
+ gcm_plaintext = malloc(TEST_LEN);
+ cyphertext = malloc(TEST_LEN);
+ ossl_plaintext = malloc(TEST_LEN + 16);
+ ossl_cyphertext = malloc(TEST_LEN);
+ gcm_tag = malloc(MAX_TAG_LEN);
+ ossl_tag = malloc(MAX_TAG_LEN);
+ AAD = malloc(AAD_LENGTH);
+ IV = malloc(GCM_IV_LEN);
+ if ((NULL == plaintext) || (NULL == cyphertext) || (NULL == gcm_plaintext)
+ || (NULL == ossl_plaintext) || (NULL == ossl_cyphertext)
+ || (NULL == gcm_tag) || (NULL == ossl_tag) || (NULL == AAD) || (NULL == IV)) {
+ printf("malloc of testsize:0x%x failed\n", TEST_LEN);
+ return -1;
+ }
+
+ mk_rand_data(plaintext, TEST_LEN);
+ mk_rand_data(AAD, AAD_LENGTH);
+ mk_rand_data(IV, GCM_IV_LEN);
+ memcpy(&IV[GCM_IV_END_START], IVend, sizeof(IVend));
+ iv_len = GCM_IV_LEN - sizeof(IVend); //end marker not part of IV length
+
+ aes_gcm_perf();
+ printf("AES gcm ISA-L vs OpenSSL performance\n");
+
+ return !OK;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c
new file mode 100644
index 000000000..7f0bbcba5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_pre.c
@@ -0,0 +1,71 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <aes_gcm.h>
+#include <aes_keyexp.h>
+
+void aes_keyexp_128_enc(uint8_t *, uint8_t *);
+void aesni_gcm128_precomp(struct gcm_data *my_ctx_data);
+void aesni_gcm256_precomp(struct gcm_data *my_ctx_data);
+
+void aesni_gcm128_pre(uint8_t * key, struct gcm_data *gdata)
+{
+ //////
+ // Prefill the key values for each round of encrypting/decrypting
+ // Prefill the Sub Hash key values for encoding the tag
+ //////
+ aes_keyexp_128_enc(key, (uint8_t *) gdata->expanded_keys);
+ aesni_gcm128_precomp(gdata);
+
+}
+
+void aesni_gcm256_pre(uint8_t * key, struct gcm_data *gdata)
+{
+ struct gcm_data tmp;
+ //////
+ // Prefill the key values for each round of encrypting/decrypting
+ // Prefill the Sub Hash key values for encoding the tag
+ //////
+ aes_keyexp_256(key, gdata->expanded_keys, tmp.expanded_keys);
+ aesni_gcm256_precomp(gdata);
+
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver aesni_gcm128_pre_slver_00000287;
+struct slver aesni_gcm128_pre_slver = { 0x0287, 0x00, 0x00 };
+
+struct slver aesni_gcm256_pre_slver_0000028f;
+struct slver aesni_gcm256_pre_slver = { 0x028f, 0x00, 0x00 };
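+
+/*
+ * Illustrative sketch (hypothetical wrapper, not part of the library API):
+ * the precompute step above is run once per key, after which the same
+ * gcm_data can be reused for any number of single-shot operations.  Assumes
+ * the aesni_gcm128_enc() prototype from aes_gcm.h as exercised by the tests
+ * in this patch.
+ */
+static void gcm128_encrypt_one(uint8_t * key, uint8_t * iv_with_end_mark,
+			       uint8_t * aad, uint64_t aad_len,
+			       uint8_t * plaintext, uint64_t len,
+			       uint8_t * ciphertext, uint8_t * tag, uint64_t tag_len)
+{
+	struct gcm_data gdata;
+
+	aesni_gcm128_pre(key, &gdata);	// expand round keys and hash subkeys once
+	aesni_gcm128_enc(&gdata, ciphertext, plaintext, len, iv_with_end_mark,
+			 aad, aad_len, tag, tag_len);
+}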
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c
new file mode 100644
index 000000000..e4b5b92fb
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_random_test.c
@@ -0,0 +1,1937 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include <openssl/sha.h>
+#include "gcm_vectors.h"
+#include "ossl_helper.h"
+#include "types.h"
+
+//#define GCM_VECTORS_VERBOSE
+//#define GCM_VECTORS_EXTRA_VERBOSE
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#ifndef RANDOMS
+# define RANDOMS 200
+#endif
+#ifndef TEST_LEN
+# define TEST_LEN 32*1024
+#endif
+#ifndef PAGE_LEN
+# define PAGE_LEN (4*1024)
+#endif
+
+#if defined(NT_LD) || defined(NT_ST) || defined(NT_LDST)
+# define ALIGNMENT_MASK (~15)
+# define OFFSET_BASE_VALUE 16
+#ifndef MAX_UNALIGNED
+# define MAX_UNALIGNED (1)
+#endif
+#else
+# define ALIGNMENT_MASK (~0)
+# define OFFSET_BASE_VALUE 1
+#ifndef MAX_UNALIGNED
+# define MAX_UNALIGNED (16)
+#endif
+#endif
+
+
+void dump_table(char *title, uint8_t * table, uint8_t count)
+{
+ int i;
+ char const *space = " ";
+
+ printf("%s%s => {\n", space, title);
+ for (i = 0; i < count; i++) {
+ if (0 == (i & 15))
+ printf("%s%s", space, space);
+ printf("%2x, ", table[i]);
+ if (15 == (i & 15))
+ printf("\n");
+
+ }
+ printf("%s}\n", space);
+}
+
+void dump_gcm_data(struct gcm_data *gdata)
+{
+#ifdef GCM_VECTORS_EXTRA_VERBOSE
+ printf("gcm_data {\n");
+ dump_table("expanded_keys", gdata->expanded_keys, (16 * 11));
+ dump_table("shifted_hkey_1", gdata->shifted_hkey_1, 16);
+ dump_table("shifted_hkey_2", gdata->shifted_hkey_2, 16);
+ dump_table("shifted_hkey_3", gdata->shifted_hkey_3, 16);
+ dump_table("shifted_hkey_4", gdata->shifted_hkey_4, 16);
+ dump_table("shifted_hkey_5", gdata->shifted_hkey_5, 16);
+ dump_table("shifted_hkey_6", gdata->shifted_hkey_6, 16);
+ dump_table("shifted_hkey_7", gdata->shifted_hkey_7, 16);
+ dump_table("shifted_hkey_8", gdata->shifted_hkey_8, 16);
+ dump_table("shifted_hkey_1_k", gdata->shifted_hkey_1_k, 16);
+ dump_table("shifted_hkey_2_k", gdata->shifted_hkey_2_k, 16);
+ dump_table("shifted_hkey_3_k", gdata->shifted_hkey_3_k, 16);
+ dump_table("shifted_hkey_4_k", gdata->shifted_hkey_4_k, 16);
+ dump_table("shifted_hkey_5_k", gdata->shifted_hkey_5_k, 16);
+ dump_table("shifted_hkey_6_k", gdata->shifted_hkey_6_k, 16);
+ dump_table("shifted_hkey_7_k", gdata->shifted_hkey_7_k, 16);
+ dump_table("shifted_hkey_8_k", gdata->shifted_hkey_8_k, 16);
+ printf("}\n");
+#endif	// GCM_VECTORS_EXTRA_VERBOSE
+}
+
+void mk_rand_data(uint8_t * data, uint32_t size)
+{
+ int i;
+ for (i = 0; i < size; i++) {
+ *data++ = rand();
+ }
+}
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+
+ mismatch = memcmp(test, expected, len);
+ if (mismatch) {
+ OK = 1;
+ printf(" expected results don't match %s \t\t", data_name);
+ {
+ uint64_t a;
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ }
+ return OK;
+}
+
+int check_vector(struct gcm_data *gdata, gcm_vector * vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t const IVend[] = GCM_IV_END_MARK;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen + sizeof(IVend);
+	// Allocate space for the IV with the ISA-L end marker appended
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ //Add end marker to the IV data for ISA-L
+ memcpy(IV_c, vector->IV, vector->IVlen);
+ memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aesni_gcm128_pre(vector->K, gdata);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aesni_gcm128_enc(gdata, vector->C, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aesni_gcm128_dec(gdata, vector->P, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aesni_gcm128_dec(gdata, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ free(pt_test);
+ free(ct_test);
+ free(o_ct_test);
+
+ return OK;
+}
+
+int check_strm_vector(struct gcm_data *gdata, gcm_vector * vector, int test_len)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint8_t const IVend[] = GCM_IV_END_MARK;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break;
+ int i;
+ uint8_t *rand_data = NULL;
+ uint64_t length;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen + sizeof(IVend);
+	// Allocate space for the IV with the ISA-L end marker appended
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ //Add end marker to the IV data for ISA-L
+ memcpy(IV_c, vector->IV, vector->IVlen);
+ memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aesni_gcm128_pre(vector->K, gdata);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aesni_gcm128_init(gdata, IV_c, vector->A, vector->Alen);
+
+ last_break = 0;
+ i = (rand() % test_len / 32) & ALIGNMENT_MASK;
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->P + last_break, i - last_break);
+ }
+ aesni_gcm128_enc_update(gdata, vector->C + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+ last_break = i;
+ i = (rand() % test_len / 32) & ALIGNMENT_MASK;
+
+ }
+ aesni_gcm128_enc_update(gdata, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ if (gdata->in_length != vector->Plen)
+ printf("%lu, %lu\n", gdata->in_length, vector->Plen);
+ aesni_gcm128_enc_finalize(gdata, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = 0;
+ aesni_gcm128_init(gdata, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (rand() % (test_len / 64) == 0) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->C + last_break, i - last_break);
+ }
+ aesni_gcm128_dec_update(gdata, vector->P + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+
+ last_break = i;
+
+ }
+ if (rand() % 1024 != 0)
+ i++;
+
+ }
+ aesni_gcm128_dec_update(gdata, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aesni_gcm128_dec_finalize(gdata, vector->T, vector->Tlen);
+
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aesni_gcm128_dec(gdata, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ free(pt_test);
+ free(ct_test);
+ free(o_ct_test);
+ free(rand_data);
+
+ return OK;
+}
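+
+/*
+ * Illustrative sketch of the streaming call sequence exercised above
+ * (hypothetical wrapper, not used by the tests): after aesni_gcm128_pre(),
+ * init once per message, update over arbitrary chunk boundaries, then
+ * finalize to emit the tag.
+ */
+static void gcm128_stream_enc_sketch(struct gcm_data *gdata, uint8_t * iv_with_end_mark,
+				     uint8_t * aad, uint64_t aad_len,
+				     uint8_t * pt, uint8_t * ct, uint64_t len,
+				     uint8_t * tag, uint64_t tag_len)
+{
+	uint64_t half = len / 2;
+
+	aesni_gcm128_init(gdata, iv_with_end_mark, aad, aad_len);
+	aesni_gcm128_enc_update(gdata, ct, pt, half);
+	aesni_gcm128_enc_update(gdata, ct + half, pt + half, len - half);
+	aesni_gcm128_enc_finalize(gdata, tag, tag_len);
+}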
+
+int check_strm_vector2(struct gcm_data *gdata, gcm_vector * vector, int length, int start,
+ int breaks)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint8_t const IVend[] = GCM_IV_END_MARK;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break = 0;
+ int i = length;
+ uint8_t *rand_data = NULL;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen + sizeof(IVend);
+	// Allocate space for the IV with the ISA-L end marker appended
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ //Add end marker to the IV data for ISA-L
+ memcpy(IV_c, vector->IV, vector->IVlen);
+ memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aesni_gcm128_pre(vector->K, gdata);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aesni_gcm128_enc(gdata, vector->C, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ aesni_gcm128_init(gdata, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->P + last_break, i - last_break);
+ }
+ aesni_gcm128_enc_update(gdata, vector->C + last_break, stream, i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+ last_break = i;
+ i = i + (length - start) / breaks;
+
+ }
+ aesni_gcm128_enc_update(gdata, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ aesni_gcm128_enc_finalize(gdata, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = length;
+ aesni_gcm128_init(gdata, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->C + last_break, i - last_break);
+ }
+ aesni_gcm128_dec_update(gdata, vector->P + last_break, stream, i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+ last_break = i;
+ i = i + (length - start) / breaks;
+
+ }
+
+ aesni_gcm128_dec_update(gdata, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aesni_gcm128_dec_finalize(gdata, vector->T, vector->Tlen);
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aesni_gcm128_dec(gdata, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+	free(T_test);
+	free(o_T_test);
+	free(IV_c);
+	free(pt_test);
+	free(ct_test);
+	free(o_ct_test);
+	free(rand_data);
+
+ return OK;
+}
+
+int check_strm_vector_efence(struct gcm_data *gdata, gcm_vector * vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint8_t const IVend[] = GCM_IV_END_MARK;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break = 0;
+ int i = 1;
+ uint8_t *rand_data = NULL;
+ uint64_t length;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen + sizeof(IVend);
+	// Allocate space for the IV with the ISA-L end marker appended
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ //Add end marker to the IV data for ISA-L
+ memcpy(IV_c, vector->IV, vector->IVlen);
+ memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aesni_gcm128_pre(vector->K, gdata);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aesni_gcm128_init(gdata, IV_c, vector->A, vector->Alen);
+ while (i < vector->Plen) {
+ if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) {
+ stream = malloc(PAGE_LEN);
+ i = i & ALIGNMENT_MASK;
+ memcpy(stream + PAGE_LEN - (i - last_break), vector->P + last_break,
+ i - last_break);
+ aesni_gcm128_enc_update(gdata, vector->C + last_break,
+ stream + PAGE_LEN - (i - last_break),
+ i - last_break);
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+ last_break = i;
+ }
+ if (rand() % 1024 != 0)
+ i++;
+
+ }
+ aesni_gcm128_enc_update(gdata, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ aesni_gcm128_enc_finalize(gdata, vector->T, vector->Tlen);
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+ i = 0;
+ aesni_gcm128_init(gdata, IV_c, vector->A, vector->Alen);
+ while (i < vector->Plen) {
+ if (rand() % 2000 == 0 || i - last_break > PAGE_LEN / 2) {
+ stream = malloc(PAGE_LEN);
+ i = i & ALIGNMENT_MASK;
+ memcpy(stream + PAGE_LEN - (i - last_break), vector->C + last_break,
+ i - last_break);
+ aesni_gcm128_dec_update(gdata, vector->P + last_break,
+ stream + PAGE_LEN - (i - last_break),
+ i - last_break);
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+
+ last_break = i;
+
+ }
+ if (rand() % 1024 != 0)
+ i++;
+
+ }
+ aesni_gcm128_dec_update(gdata, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aesni_gcm128_dec_finalize(gdata, vector->T, vector->Tlen);
+
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aesni_gcm128_dec(gdata, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ free(pt_test);
+ free(ct_test);
+ free(o_ct_test);
+ free(rand_data);
+
+ return OK;
+}
+
+int check_256_vector(struct gcm_data *gdata, gcm_vector * vector)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t const IVend[] = GCM_IV_END_MARK;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen + sizeof(IVend);
+	// Allocate space for the IV with the ISA-L end marker appended
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ //Add end marker to the IV data for ISA-L
+ memcpy(IV_c, vector->IV, vector->IVlen);
+ memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aesni_gcm256_pre(vector->K, gdata);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aesni_gcm256_enc(gdata, vector->C, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aesni_gcm256_dec(gdata, vector->P, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted ISA-L plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aesni_gcm256_dec(gdata, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted OpenSSL plain text (P)");
+ result =
+ openssl_aes_256_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ free(pt_test);
+ free(ct_test);
+ free(o_ct_test);
+
+ return OK;
+}
+
+int check_256_strm_vector(struct gcm_data *gdata, gcm_vector * vector, int test_len)
+{
+ uint8_t *pt_test = NULL;
+ uint8_t *ct_test = NULL;
+ uint8_t *o_ct_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *o_T_test = NULL;
+ uint8_t *stream = NULL;
+ uint8_t const IVend[] = GCM_IV_END_MARK;
+ uint64_t IV_alloc_len = 0;
+ int result;
+ int OK = 0;
+ uint32_t last_break;
+ int i;
+ uint8_t *rand_data = NULL;
+ uint64_t length;
+
+ rand_data = malloc(100);
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("combination vector Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ (int)vector->Klen,
+ (int)vector->IVlen, (int)vector->Plen, (int)vector->Alen, (int)vector->Tlen);
+#else
+ printf(".");
+#endif
+ // Allocate space for the calculated ciphertext
+ if (vector->Plen != 0) {
+ pt_test = malloc(vector->Plen);
+ ct_test = malloc(vector->Plen);
+ o_ct_test = malloc(vector->Plen);
+ if ((pt_test == NULL) || (ct_test == NULL) || (o_ct_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ }
+ IV_alloc_len = vector->IVlen + sizeof(IVend);
+	// Allocate space for the IV with the ISA-L end marker appended
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ //Add end marker to the IV data for ISA-L
+ memcpy(IV_c, vector->IV, vector->IVlen);
+ memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+ T_test = malloc(vector->Tlen);
+ o_T_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (o_T_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aesni_gcm256_pre(vector->K, gdata);
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aesni_gcm256_init(gdata, IV_c, vector->A, vector->Alen);
+
+ last_break = 0;
+ i = (rand() % test_len / 32) & ALIGNMENT_MASK;
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->P + last_break, i - last_break);
+ }
+
+ aesni_gcm256_enc_update(gdata, vector->C + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+ last_break = i;
+ i += (rand() % test_len / 32) & ALIGNMENT_MASK;
+
+ }
+ aesni_gcm256_enc_update(gdata, vector->C + last_break, vector->P + last_break,
+ vector->Plen - last_break);
+ if (gdata->in_length != vector->Plen)
+ printf("%lu, %lu\n", gdata->in_length, vector->Plen);
+ aesni_gcm256_enc_finalize(gdata, vector->T, vector->Tlen);
+
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen, o_T_test,
+ vector->Tlen, vector->P, vector->Plen, o_ct_test);
+ OK |=
+ check_data(vector->C, o_ct_test, vector->Plen, "OpenSSL vs ISA-L cypher text (C)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L encrypt tag (T)");
+
+ memcpy(ct_test, vector->C, vector->Plen);
+ memcpy(pt_test, vector->P, vector->Plen);
+ memset(vector->P, 0, vector->Plen);
+ memcpy(T_test, vector->T, vector->Tlen);
+ memset(vector->T, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+
+ last_break = 0;
+	i = (rand() % test_len / 32) & ALIGNMENT_MASK;
+ aesni_gcm256_init(gdata, IV_c, vector->A, vector->Alen);
+ while (i < (vector->Plen)) {
+ if (i - last_break != 0) {
+ stream = malloc(i - last_break);
+ memcpy(stream, vector->C + last_break, i - last_break);
+ }
+
+ aesni_gcm256_dec_update(gdata, vector->P + last_break, stream,
+ i - last_break);
+ if (i - last_break != 0)
+ free(stream);
+
+ if (rand() % 1024 == 0) {
+ length = rand() % 100;
+
+ mk_rand_data(rand_data, length);
+ SHA1(rand_data, length, rand_data);
+ }
+
+ last_break = i;
+ i += (rand() % test_len / 32) & ALIGNMENT_MASK;
+
+
+ }
+ aesni_gcm256_dec_update(gdata, vector->P + last_break, vector->C + last_break,
+ vector->Plen - last_break);
+ aesni_gcm256_dec_finalize(gdata, vector->T, vector->Tlen);
+
+ OK |= check_data(vector->T, T_test, vector->Tlen, "ISA-L decrypt vs encrypt tag (T)");
+ OK |=
+ check_data(vector->T, o_T_test, vector->Tlen, "OpenSSL vs ISA-L decrypt tag (T)");
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted ISA-L plain text (P)");
+ memset(vector->P, 0, vector->Plen);
+ aesni_gcm256_dec(gdata, vector->P, o_ct_test, vector->Plen,
+ IV_c, vector->A, vector->Alen, vector->T, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L decrypted OpenSSL plain text (P)");
+ result =
+ openssl_aes_256_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A, vector->Alen,
+ vector->T, vector->Tlen, vector->C, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+ free(T_test);
+ free(o_T_test);
+ free(IV_c);
+ free(pt_test);
+ free(ct_test);
+ free(o_ct_test);
+
+ return OK;
+}
+
+int test_gcm_strm_efence(void)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_data *gdata = NULL;
+
+ gdata = malloc(sizeof(struct gcm_data));
+ if (NULL == gdata)
+ return 1;
+
+ printf("AES GCM random efence test vectors with random stream:");
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % TEST_LEN);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+		if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+		} else {	// This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_strm_vector_efence(gdata, &test))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gdata);
+ return 0;
+}
+
+int test_gcm_strm_combinations(int test_len)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ uint8_t *gdatatemp = NULL;
+ struct gcm_data *gdata = NULL;
+
+ gdatatemp = malloc(sizeof(struct gcm_data) + 16);
+ gdata = (struct gcm_data *)(gdatatemp + rand() % 16);
+ if (NULL == gdata)
+ return 1;
+
+ printf("AES GCM random test vectors with random stream of average size %d:",
+ test_len / 64);
+ for (t = 0; RANDOMS > t; t++) {
+		int Plen = (rand() % test_len);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % test_len);
+ int offset = (rand() % MAX_UNALIGNED);
+		if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+		} else {	// This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_strm_vector(gdata, &test, test_len))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gdatatemp);
+ return 0;
+}
+
+int test_gcm_combinations(void)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_data *gdata = NULL;
+
+ gdata = malloc(sizeof(struct gcm_data));
+ if (NULL == gdata)
+ return 1;
+
+ printf("AES GCM random test vectors:");
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % TEST_LEN);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+		if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+		} else {	// This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_vector(gdata, &test))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gdata);
+ return 0;
+}
+
+int test_gcm256_combinations(void)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_data *gdata = NULL;
+
+ gdata = malloc(sizeof(struct gcm_data));
+ if (NULL == gdata)
+ return 1;
+
+ printf("AES-GCM-256 random test vectors:");
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % TEST_LEN);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+		if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+		} else {	// This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_256_KEY_LEN + offset);
+ test.Klen = GCM_256_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+		// single Key length of 256bits/32bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_256_vector(gdata, &test))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gdata);
+ return 0;
+}
+
+int test_gcm256_strm_combinations(int test_len)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ uint8_t *gdatatemp = NULL;
+ struct gcm_data *gdata = NULL;
+
+ gdatatemp = malloc(sizeof(struct gcm_data) + 16);
+ gdata = (struct gcm_data *)(gdatatemp + rand() % 16);
+ if (NULL == gdata)
+ return 1;
+
+ printf("AES-GCM-256 random test vectors with random stream of average size %d:",
+ test_len / 64);
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = (rand() % test_len);
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % test_len);
+ int offset = (rand() % MAX_UNALIGNED);
+		if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+		} else {	// This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_256_KEY_LEN + offset);
+ test.Klen = GCM_256_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+		// single Key length of 256bits/32bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_256_strm_vector(gdata, &test, test_len))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gdatatemp);
+ return 0;
+}
+
+//
+// place all data to end at a page boundary to check for read past the end
+//
+int test_gcm_efence(void)
+{
+ gcm_vector test;
+ int offset = 0;
+ gcm_key_size key_len;
+ struct gcm_data *gdata = NULL;
+ uint8_t *P, *C, *K, *IV, *A, *T;
+
+ gdata = malloc(sizeof(struct gcm_data));
+ P = malloc(PAGE_LEN);
+ C = malloc(PAGE_LEN);
+ K = malloc(PAGE_LEN);
+ IV = malloc(PAGE_LEN);
+ A = malloc(PAGE_LEN);
+ T = malloc(PAGE_LEN);
+ if ((NULL == P) || (NULL == C) || (NULL == K) || (NULL == IV) || (NULL == A)
+ || (NULL == T) || (NULL == gdata)) {
+ printf("malloc of testsize:0x%x failed\n", PAGE_LEN);
+ return -1;
+ }
+
+ test.Plen = PAGE_LEN / 2;
+ // place buffers to end at page boundary
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.Alen = test.Plen;
+ test.Tlen = MAX_TAG_LEN;
+
+ printf("AES GCM efence test vectors:");
+ for (key_len = GCM_128_KEY_LEN; GCM_256_KEY_LEN >= key_len;
+ key_len += (GCM_256_KEY_LEN - GCM_128_KEY_LEN)) {
+ test.Klen = key_len;
+ for (offset = 0; MAX_UNALIGNED > offset; offset++) {
+ if (0 == (offset % 80))
+ printf("\n");
+ // move the start and size of the data block towards the end of the page
+ test.Plen = (PAGE_LEN / 2) - offset;
+ test.Alen = (PAGE_LEN / 4) - (offset * 4); //lengths must be a multiple of 4 bytes
+ //Place data at end of page
+ test.P = P + PAGE_LEN - test.Plen;
+ test.C = C + PAGE_LEN - test.Plen;
+ test.K = K + PAGE_LEN - test.Klen;
+ test.IV = IV + PAGE_LEN - test.IVlen;
+ test.A = A + PAGE_LEN - test.Alen;
+ test.T = T + PAGE_LEN - test.Tlen;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+ if (GCM_128_KEY_LEN == key_len) {
+ if (0 != check_vector(gdata, &test))
+ return 1;
+ } else {
+ if (0 != check_256_vector(gdata, &test))
+ return 1;
+ }
+ }
+ }
+ free(gdata);
+ free(P);
+ free(C);
+ free(K);
+ free(IV);
+ free(A);
+ free(T);
+
+ printf("\n");
+ return 0;
+}
+
+int test_gcm128_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_data gdata;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint8_t const IVend[] = GCM_IV_END_MARK;
+ uint64_t IV_alloc_len = 0;
+ int result;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("AES-GCM-128:\n");
+#endif
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+ if (ct_test == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+	// Allocate space for the calculated plaintext
+ pt_test = malloc(vector->Plen);
+ if (pt_test == NULL) {
+ fprintf(stderr, "Can't allocate plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen + sizeof(IVend);
+	// Allocate space for the IV with the ISA-L end marker appended
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ //Add end marker to the IV data for ISA-L
+ memcpy(IV_c, vector->IV, vector->IVlen);
+ memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aesni_gcm128_pre(vector->K, &gdata);
+#ifdef GCM_VECTORS_VERBOSE
+ dump_gcm_data(&gdata);
+#endif
+
+ ////
+ // ISA-l Encrypt
+ ////
+ aesni_gcm128_enc(&gdata, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, pt_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L tag (T)");
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aesni_gcm128_enc(&gdata, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aesni_gcm128_dec(&gdata, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+	// test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aesni_gcm128_dec(&gdata, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aesni_gcm128_enc(&gdata, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aesni_gcm128_dec(&gdata, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+ // OpenSSl enc -> ISA-L dec
+ openssl_aes_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |=
+ check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)");
+ memset(pt_test, 0, vector->Plen);
+ aesni_gcm128_dec(&gdata, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "OpenSSL->ISA-L decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)");
+ // ISA-L enc -> OpenSSl dec
+ aesni_gcm128_enc(&gdata, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ result =
+ openssl_aes_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ ct_test, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+	OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (P)");
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm256_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_data gdata;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint8_t const IVend[] = GCM_IV_END_MARK;
+ uint64_t IV_alloc_len = 0;
+ int result;
+
+#ifdef GCM_VECTORS_VERBOSE
+ printf("AES-GCM-256:\n");
+#endif
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+	// Allocate space for the calculated plaintext
+ pt_test = malloc(vector->Plen);
+ if ((ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen + sizeof(IVend);
+	// Allocate space for the IV with the ISA-L end marker appended
+	IV_c = malloc(IV_alloc_len);
+	if (IV_c == NULL) {
+		fprintf(stderr, "Can't allocate IV memory\n");
+ return 1;
+ }
+ //Add end marker to the IV data for ISA-L
+ memcpy(IV_c, vector->IV, vector->IVlen);
+ memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+	if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aesni_gcm256_pre(vector->K, &gdata);
+#ifdef GCM_VECTORS_VERBOSE
+ dump_gcm_data(&gdata);
+#endif
+
+ ////
+ // ISA-l Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ aesni_gcm256_enc(&gdata, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, pt_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+	OK |= check_data(ct_test, vector->C, vector->Plen, "OpenSSL vs KA - cypher text (C)");
+ OK |= check_data(pt_test, vector->T, vector->Tlen, "OpenSSL vs KA - tag (T)");
+ OK |= check_data(pt_test, T_test, vector->Tlen, "OpenSSL vs ISA-L - tag (T)");
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aesni_gcm256_enc(&gdata, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-l Decrypt
+ ////
+ aesni_gcm256_dec(&gdata, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+	// test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aesni_gcm256_dec(&gdata, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aesni_gcm256_enc(&gdata, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aesni_gcm256_dec(&gdata, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+ // OpenSSl enc -> ISA-L dec
+ openssl_aes_256_gcm_enc(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ vector->P, vector->Plen, ct_test);
+ OK |=
+ check_data(ct_test, vector->C, vector->Plen, "OpenSSL encrypted cypher text (C)");
+ memset(pt_test, 0, vector->Plen);
+ aesni_gcm256_dec(&gdata, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "OpenSSL->ISA-L decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "OpenSSL->ISA-L decrypted tag (T)");
+ // ISA-L enc -> OpenSSl dec
+ aesni_gcm256_enc(&gdata, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ result =
+ openssl_aes_256_gcm_dec(vector->K, vector->IV,
+ vector->IVlen, vector->A,
+ vector->Alen, T_test, vector->Tlen,
+ ct_test, vector->Plen, pt_test);
+ if (-1 == result)
+ printf(" ISA-L->OpenSSL decryption failed Authentication\n");
+ OK |= (-1 == result);
+	OK |= check_data(pt_test, vector->P, vector->Plen, "OSSL decrypted plain text (P)");
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm_std_vectors(void)
+{
+ int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]);
+ int vect;
+ int OK = 0;
+
+ printf("AES-GCM standard test vectors:\n");
+ for (vect = 0; ((vect < vectors_cnt) /*&& (1 == OK) */ ); vect++) {
+#ifdef GCM_VECTORS_VERBOSE
+ printf
+ ("Standard vector %d/%d Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen,
+ (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen,
+ (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen);
+#else
+ printf(".");
+#endif
+
+ if (BITS_128 == gcm_vectors[vect].Klen) {
+ OK |= test_gcm128_std_vectors(&gcm_vectors[vect]);
+ } else {
+ OK |= test_gcm256_std_vectors(&gcm_vectors[vect]);
+ }
+ if (0 != OK)
+ return OK;
+ }
+ printf("\n");
+ return OK;
+}
+
+// The length of the data is set to length. The first stream is from 0 to start. After
+// that the data is broken into breaks chunks of equal size (except possibly the last
+// one due to divisibility).
+int test_gcm_strm_combinations2(int length, int start, int breaks)
+{
+ gcm_vector test;
+ int tag_len = 8;
+ int t = 0;
+ struct gcm_data *gdata = NULL;
+
+ gdata = malloc(sizeof(struct gcm_data));
+ if (NULL == gdata)
+ return 1;
+
+ printf("AES GCM random test vectors of length %d and stream with %d breaks:", length,
+ breaks + 1);
+ for (t = 0; RANDOMS > t; t++) {
+ int Plen = length;
+ //lengths must be a multiple of 4 bytes
+ int aad_len = (rand() % TEST_LEN);
+ int offset = (rand() % MAX_UNALIGNED);
+		if (offset == 0 && aad_len == 0)
+ offset = OFFSET_BASE_VALUE;
+
+ if (0 == (t % 25))
+ printf("\n");
+ if (0 == (t % 10))
+ fflush(0);
+ test.P = NULL;
+ test.C = NULL;
+ test.A = NULL;
+ test.T = NULL;
+ test.Plen = Plen;
+ if (test.Plen + offset != 0) {
+ test.P = malloc(test.Plen + offset);
+ test.C = malloc(test.Plen + offset);
+		} else {	// This else clause is here because openssl 1.0.1k does not handle NULL pointers
+ test.P = malloc(16);
+ test.C = malloc(16);
+ }
+ test.K = malloc(GCM_128_KEY_LEN + offset);
+ test.Klen = GCM_128_KEY_LEN;
+ test.IV = malloc(GCM_IV_DATA_LEN + offset);
+ test.IVlen = GCM_IV_DATA_LEN;
+ test.A = malloc(aad_len + offset);
+
+ test.Alen = aad_len;
+ test.T = malloc(MAX_TAG_LEN + offset);
+
+ if ((NULL == test.P && test.Plen != 0) || (NULL == test.K)
+ || (NULL == test.IV)) {
+ printf("malloc of testsize:0x%x failed\n", Plen);
+ return 1;
+ }
+
+ test.P += offset;
+ test.C += offset;
+ test.K += offset;
+ test.IV += offset;
+ test.A += offset;
+ test.T += offset;
+
+ mk_rand_data(test.P, test.Plen);
+ mk_rand_data(test.K, test.Klen);
+ mk_rand_data(test.IV, test.IVlen);
+ mk_rand_data(test.A, test.Alen);
+
+ // single Key length of 128bits/16bytes supported
+ // single IV length of 96bits/12bytes supported
+ // Tag lengths of 8, 12 or 16
+ for (tag_len = 8; tag_len <= MAX_TAG_LEN;) {
+ test.Tlen = tag_len;
+ if (0 != check_strm_vector2(gdata, &test, length, start, breaks))
+ return 1;
+ tag_len += 4; //supported lengths are 8, 12 or 16
+ }
+ test.A -= offset;
+ free(test.A);
+ test.C -= offset;
+ free(test.C);
+ test.IV -= offset;
+ free(test.IV);
+ test.K -= offset;
+ free(test.K);
+ test.P -= offset;
+ free(test.P);
+ test.T -= offset;
+ free(test.T);
+ }
+ printf("\n");
+ free(gdata);
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ int errors = 0;
+ int seed;
+
+ if (argc == 1)
+ seed = TEST_SEED;
+ else
+ seed = atoi(argv[1]);
+
+ srand(seed);
+ printf("SEED: %d\n", seed);
+
+ errors += test_gcm_std_vectors();
+ errors += test_gcm256_combinations();
+ errors += test_gcm_combinations();
+ errors += test_gcm_efence();
+ errors += test_gcm256_strm_combinations(TEST_LEN);
+ errors += test_gcm_strm_combinations(TEST_LEN);
+ errors += test_gcm256_strm_combinations(1024);
+ errors += test_gcm_strm_combinations(1024);
+ errors += test_gcm_strm_efence();
+ errors += test_gcm_strm_combinations2(1024, 0, 1024);
+
+ if (0 == errors)
+ printf("...Pass\n");
+ else
+ printf("...Fail\n");
+
+ return errors;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c
new file mode 100644
index 000000000..c5c6367b4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_std_vectors_test.c
@@ -0,0 +1,322 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h> // for memcmp
+#include <aes_gcm.h>
+#include "gcm_vectors.h"
+#include "types.h"
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+int check_data(uint8_t * test, uint8_t * expected, uint64_t len, char *data_name)
+{
+ int mismatch;
+ int OK = 0;
+
+ mismatch = memcmp(test, expected, len);
+ if (mismatch) {
+ OK = 1;
+ printf(" expected results don't match %s \t\t", data_name);
+ {
+ uint64_t a;
+ for (a = 0; a < len; a++) {
+ if (test[a] != expected[a]) {
+ printf(" '%x' != '%x' at %lx of %lx\n",
+ test[a], expected[a], a, len);
+ break;
+ }
+ }
+ }
+ }
+ return OK;
+}
+
+int test_gcm128_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_data gdata;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint8_t const IVend[] = GCM_IV_END_MARK;
+ uint64_t IV_alloc_len = 0;
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+ if (ct_test == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ // Allocate space for the calculated plaintext
+ pt_test = malloc(vector->Plen);
+ if (pt_test == NULL) {
+ fprintf(stderr, "Can't allocate plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen + sizeof(IVend);
+ // Allocate space for the IV with the ISA-L end marker appended
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ //Add end marker to the IV data for ISA-L
+ memcpy(IV_c, vector->IV, vector->IVlen);
+ memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aesni_gcm128_pre(vector->K, &gdata);
+
+ ////
+ // ISA-L Encrypt
+ ////
+ aesni_gcm128_enc(&gdata, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aesni_gcm128_enc(&gdata, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-L Decrypt
+ ////
+ aesni_gcm128_dec(&gdata, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aesni_gcm128_dec(&gdata, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aesni_gcm128_enc(&gdata, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aesni_gcm128_dec(&gdata, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+ memset(pt_test, 0, vector->Plen);
+
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm256_std_vectors(gcm_vector const *vector)
+{
+ struct gcm_data gdata;
+ int OK = 0;
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test = NULL;
+ uint8_t *pt_test = NULL;
+ uint8_t *IV_c = NULL;
+ uint8_t *T_test = NULL;
+ uint8_t *T2_test = NULL;
+ uint8_t const IVend[] = GCM_IV_END_MARK;
+ uint64_t IV_alloc_len = 0;
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vector->Plen);
+ // Allocate space for the calculated plaintext
+ pt_test = malloc(vector->Plen);
+ if ((ct_test == NULL) || (pt_test == NULL)) {
+ fprintf(stderr, "Can't allocate ciphertext or plaintext memory\n");
+ return 1;
+ }
+ IV_alloc_len = vector->IVlen + sizeof(IVend);
+ // Allocate space for the IV with the ISA-L end marker appended
+ IV_c = malloc(IV_alloc_len);
+ if (IV_c == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return 1;
+ }
+ //Add end marker to the IV data for ISA-L
+ memcpy(IV_c, vector->IV, vector->IVlen);
+ memcpy(&IV_c[vector->IVlen], IVend, sizeof(IVend));
+
+ T_test = malloc(vector->Tlen);
+ T2_test = malloc(vector->Tlen);
+ if ((T_test == NULL) || (T2_test == NULL)) {
+ fprintf(stderr, "Can't allocate tag memory\n");
+ return 1;
+ }
+ // This is only required once for a given key
+ aesni_gcm256_pre(vector->K, &gdata);
+
+ ////
+ // ISA-L Encrypt
+ ////
+ memset(ct_test, 0, vector->Plen);
+ aesni_gcm256_enc(&gdata, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->C, vector->Plen, "ISA-L encrypted cypher text (C)");
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L tag (T)");
+
+ // test of in-place encrypt
+ memcpy(pt_test, vector->P, vector->Plen);
+ aesni_gcm256_enc(&gdata, pt_test, pt_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->C, vector->Plen,
+ "ISA-L encrypted cypher text(in-place)");
+ memset(ct_test, 0, vector->Plen);
+ memset(T_test, 0, vector->Tlen);
+
+ ////
+ // ISA-L Decrypt
+ ////
+ aesni_gcm256_dec(&gdata, pt_test, vector->C, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(pt_test, vector->P, vector->Plen, "ISA-L decrypted plain text (P)");
+ // GCM decryption outputs a 16 byte tag value that must be verified against the expected tag value
+ OK |= check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T)");
+
+ // test of in-place decrypt
+ memcpy(ct_test, vector->C, vector->Plen);
+ aesni_gcm256_dec(&gdata, ct_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T_test, vector->Tlen);
+ OK |= check_data(ct_test, vector->P, vector->Plen, "ISA-L plain text (P) - in-place");
+ OK |=
+ check_data(T_test, vector->T, vector->Tlen, "ISA-L decrypted tag (T) - in-place");
+ // ISA-L enc -> ISA-L dec
+ aesni_gcm256_enc(&gdata, ct_test, vector->P, vector->Plen,
+ IV_c, vector->A, vector->Alen, T_test, vector->Tlen);
+ memset(pt_test, 0, vector->Plen);
+ aesni_gcm256_dec(&gdata, pt_test, ct_test, vector->Plen, IV_c,
+ vector->A, vector->Alen, T2_test, vector->Tlen);
+ OK |=
+ check_data(pt_test, vector->P, vector->Plen,
+ "ISA-L self decrypted plain text (P)");
+ OK |= check_data(T_test, T2_test, vector->Tlen, "ISA-L self decrypted tag (T)");
+
+ if (NULL != ct_test)
+ free(ct_test);
+ if (NULL != pt_test)
+ free(pt_test);
+ if (NULL != IV_c)
+ free(IV_c);
+ if (NULL != T_test)
+ free(T_test);
+ if (NULL != T2_test)
+ free(T2_test);
+
+ return OK;
+}
+
+int test_gcm_std_vectors(void)
+{
+ int const vectors_cnt = sizeof(gcm_vectors) / sizeof(gcm_vectors[0]);
+ int vect;
+ int OK = 0;
+
+ printf("AES-GCM standard test vectors:\n");
+ for (vect = 0; ((vect < vectors_cnt) /*&& (1 == OK) */ ); vect++) {
+#ifdef DEBUG
+ printf
+ ("Standard vector %d/%d Keylen:%d IVlen:%d PTLen:%d AADlen:%d Tlen:%d\n",
+ vect, vectors_cnt - 1, (int)gcm_vectors[vect].Klen,
+ (int)gcm_vectors[vect].IVlen, (int)gcm_vectors[vect].Plen,
+ (int)gcm_vectors[vect].Alen, (int)gcm_vectors[vect].Tlen);
+#else
+ printf(".");
+#endif
+
+ if (BITS_128 == gcm_vectors[vect].Klen) {
+ OK |= test_gcm128_std_vectors(&gcm_vectors[vect]);
+ } else {
+ OK |= test_gcm256_std_vectors(&gcm_vectors[vect]);
+ }
+ if (0 != OK)
+ return OK;
+ }
+ printf("\n");
+ return OK;
+}
+
+int main(int argc, char **argv)
+{
+ int errors = 0;
+ int seed;
+
+ if (argc == 1)
+ seed = TEST_SEED;
+ else
+ seed = atoi(argv[1]);
+
+ srand(seed);
+ printf("SEED: %d\n", seed);
+
+ errors = test_gcm_std_vectors();
+
+ if (0 == errors)
+ printf("...Pass\n");
+ else
+ printf("...Fail\n");
+
+ return errors;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h
new file mode 100644
index 000000000..c8bb34a57
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/gcm_vectors.h
@@ -0,0 +1,476 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef AES_GCM_VECTORS_H_
+#define AES_GCM_VECTORS_H_
+
+#include <stdint.h>
+
+typedef enum gcm_key_size { BITS_128 = 16, BITS_256 = 32 } gcm_key_size;
+#define KBITS(K) (sizeof(K))
+
+// struct to hold pointers to the key, plaintext and ciphertext vectors
+typedef struct gcm_vector {
+ uint8_t* K; // AES Key
+ gcm_key_size Klen; // length of key in bytes
+ uint8_t* IV; // initial value used by GCM
+ uint64_t IVlen; // length of IV in bytes
+ uint8_t* A; // additional authenticated data
+ uint64_t Alen; // length of AAD in bytes
+ uint8_t* P; // Plain text
+ uint64_t Plen; // length of our plaintext
+ //outputs of encryption
+ uint8_t* C; // same length as PT
+ uint8_t* T; // Authentication tag
+ uint8_t Tlen; // tag length can be 0 to 128 bits
+} gcm_vector;
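+
+// A minimal consumption sketch (this simply mirrors how gcm_std_vectors_test.c
+// drives these vectors; IV_c is the vector IV with the GCM_IV_END_MARK bytes
+// appended, and ct/tag are caller-allocated buffers):
+//
+//   struct gcm_data gdata;
+//   aesni_gcm128_pre(v->K, &gdata);
+//   aesni_gcm128_enc(&gdata, ct, v->P, v->Plen, IV_c,
+//                    v->A, v->Alen, tag, v->Tlen);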
+
+///////
+// 60-Byte Packet Encryption Using GCM-AES-128
+// http://www.ieee802.org/1/files/public/docs2011/bn-randall-test-vectors-0511-v1.pdf
+// K: AD7A2BD03EAC835A6F620FDCB506B345
+// IV: 12153524C0895E81B2C28465
+// AAD: D609B1F056637A0D46DF998D88E52E00
+// B2C2846512153524C0895E81
+// P: 08000F101112131415161718191A1B1C
+// 1D1E1F202122232425262728292A2B2C
+// 2D2E2F303132333435363738393A0002
+// C: 701AFA1CC039C0D765128A665DAB6924
+// 3899BF7318CCDC81C9931DA17FBE8EDD
+// 7D17CB8B4C26FC81E3284F2B7FBA713D
+// AT: 4F8D55E7D3F06FD5A13C0C29B9D5B880
+// H: 73A23D80121DE2D5A850253FCF43120E
+///////
+static uint8_t K1[] = {0xAD, 0x7A, 0x2B, 0xD0, 0x3E, 0xAC, 0x83, 0x5A, 0x6F, 0x62, 0x0F, 0xDC, 0xB5, 0x06, 0xB3, 0x45};
+static uint8_t P1[] = {
+ 0x08, 0x00, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C
+ , 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C
+ , 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x00, 0x02
+};
+static uint8_t IV1[] = {0x12, 0x15, 0x35, 0x24, 0xC0, 0x89, 0x5E, 0x81, 0xB2, 0xC2, 0x84, 0x65};
+static uint8_t A1[] = {
+ 0xD6, 0x09, 0xB1, 0xF0, 0x56, 0x63, 0x7A, 0x0D, 0x46, 0xDF, 0x99, 0x8D, 0x88, 0xE5, 0x2E, 0x00
+ , 0xB2, 0xC2, 0x84, 0x65, 0x12, 0x15, 0x35, 0x24, 0xC0, 0x89, 0x5E, 0x81
+};
+#define A1_len sizeof(A1)
+static uint8_t C1[] = {
+ 0x70, 0x1A, 0xFA, 0x1C, 0xC0, 0x39, 0xC0, 0xD7, 0x65, 0x12, 0x8A, 0x66, 0x5D, 0xAB, 0x69, 0x24
+ , 0x38, 0x99, 0xBF, 0x73, 0x18, 0xCC, 0xDC, 0x81, 0xC9, 0x93, 0x1D, 0xA1, 0x7F, 0xBE, 0x8E, 0xDD
+ , 0x7D, 0x17, 0xCB, 0x8B, 0x4C, 0x26, 0xFC, 0x81, 0xE3, 0x28, 0x4F, 0x2B, 0x7F, 0xBA, 0x71, 0x3D
+};
+static uint8_t T1[] = {
+ 0x4F, 0x8D, 0x55, 0xE7, 0xD3, 0xF0, 0x6F, 0xD5, 0xA1, 0x3C, 0x0C, 0x29, 0xB9, 0xD5, 0xB8, 0x80
+};
+
+
+///////
+// 54-Byte Packet Encryption Using GCM-AES-128
+// http://www.ieee802.org/1/files/public/docs2011/bn-randall-test-vectors-0511-v1.pdf
+// K: 071B113B0CA743FECCCF3D051F737382
+// IV: F0761E8DCD3D000176D457ED
+// AAD: E20106D7CD0DF0761E8DCD3D88E54C2A
+// 76D457ED
+// P: 08000F101112131415161718191A1B1C
+// 1D1E1F202122232425262728292A2B2C
+// 2D2E2F30313233340004
+// C: 13B4C72B389DC5018E72A171DD85A5D3
+// 752274D3A019FBCAED09A425CD9B2E1C
+// 9B72EEE7C9DE7D52B3F3
+// AT: D6A5284F4A6D3FE22A5D6C2B960494C3
+// H: E4E01725D724C1215C7309AD34539257
+///////
+static uint8_t K2[] = {0x07, 0x1B, 0x11, 0x3B, 0x0C, 0xA7, 0x43, 0xFE, 0xCC, 0xCF, 0x3D, 0x05, 0x1F, 0x73, 0x73, 0x82};
+static uint8_t P2[] = {
+ 0x08, 0x00, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C
+ , 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C
+ , 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x00, 0x04
+};
+static uint8_t IV2[] = {0xF0, 0x76, 0x1E, 0x8D, 0xCD, 0x3D, 0x00, 0x01, 0x76, 0xD4, 0x57, 0xED};
+//static uint8_t IV1p[] = {0, 0, 0, 1};
+static uint8_t A2[] = {
+ 0xE2, 0x01, 0x06, 0xD7, 0xCD, 0x0D, 0xF0, 0x76, 0x1E, 0x8D, 0xCD, 0x3D, 0x88, 0xE5, 0x4C, 0x2A
+ , 0x76, 0xD4, 0x57, 0xED
+};
+#define A2_len sizeof(A2)
+static uint8_t C2[] = {
+ 0x13, 0xB4, 0xC7, 0x2B, 0x38, 0x9D, 0xC5, 0x01, 0x8E, 0x72, 0xA1, 0x71, 0xDD, 0x85, 0xA5, 0xD3
+ , 0x75, 0x22, 0x74, 0xD3, 0xA0, 0x19, 0xFB, 0xCA, 0xED, 0x09, 0xA4, 0x25, 0xCD, 0x9B, 0x2E, 0x1C
+ , 0x9B, 0x72, 0xEE, 0xE7, 0xC9, 0xDE, 0x7D, 0x52, 0xB3, 0xF3
+};
+static uint8_t T2[] = {
+ 0xD6, 0xA5, 0x28, 0x4F, 0x4A, 0x6D, 0x3F, 0xE2, 0x2A, 0x5D, 0x6C, 0x2B, 0x96, 0x04, 0x94, 0xC3
+};
+
+
+///////
+// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp
+// [Keylen = 128]
+// [IVlen = 96]
+// [PTlen = 128]
+// [AADlen = 128]
+// [Taglen = 128]
+// Count = 0
+// K: c939cc13397c1d37de6ae0e1cb7c423c
+// IV: b3d8cc017cbb89b39e0f67e2
+// P: c3b3c41f113a31b73d9a5cd432103069
+// AAD: 24825602bd12a984e0092d3e448eda5f
+// C: 93fe7d9e9bfd10348a5606e5cafa7354
+// AT: 0032a1dc85f1c9786925a2e71d8272dd
+///////
+static uint8_t K3[] = {0xc9, 0x39, 0xcc, 0x13, 0x39, 0x7c, 0x1d, 0x37, 0xde, 0x6a, 0xe0, 0xe1, 0xcb, 0x7c, 0x42, 0x3c};
+static uint8_t IV3[] = {0xb3, 0xd8, 0xcc, 0x01, 0x7c, 0xbb, 0x89, 0xb3, 0x9e, 0x0f, 0x67, 0xe2};
+static uint8_t P3[] = {0xc3, 0xb3, 0xc4, 0x1f, 0x11, 0x3a, 0x31, 0xb7, 0x3d, 0x9a, 0x5c, 0xd4, 0x32, 0x10, 0x30, 0x69};
+static uint8_t A3[] = {0x24, 0x82, 0x56, 0x02, 0xbd, 0x12, 0xa9, 0x84, 0xe0, 0x09, 0x2d, 0x3e, 0x44, 0x8e, 0xda, 0x5f};
+#define A3_len sizeof(A3)
+static uint8_t C3[] = {0x93, 0xfe, 0x7d, 0x9e, 0x9b, 0xfd, 0x10, 0x34, 0x8a, 0x56, 0x06, 0xe5, 0xca, 0xfa, 0x73, 0x54};
+static uint8_t T3[] = {0x00, 0x32, 0xa1, 0xdc, 0x85, 0xf1, 0xc9, 0x78, 0x69, 0x25, 0xa2, 0xe7, 0x1d, 0x82, 0x72, 0xdd};
+
+///////
+// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp
+// [Keylen = 128]
+// [IVlen = 96]
+// [PTlen = 256]
+// [AADlen = 128]
+// [Taglen = 128]
+// Count = 0
+// K = 298efa1ccf29cf62ae6824bfc19557fc
+// IV = 6f58a93fe1d207fae4ed2f6d
+// P = cc38bccd6bc536ad919b1395f5d63801f99f8068d65ca5ac63872daf16b93901
+// AAD = 021fafd238463973ffe80256e5b1c6b1
+// C = dfce4e9cd291103d7fe4e63351d9e79d3dfd391e3267104658212da96521b7db
+// T = 542465ef599316f73a7a560509a2d9f2
+///////
+static uint8_t K4[] = {0x29, 0x8e, 0xfa, 0x1c, 0xcf, 0x29, 0xcf, 0x62, 0xae, 0x68, 0x24, 0xbf, 0xc1, 0x95, 0x57, 0xfc};
+static uint8_t IV4[] = {0x6f, 0x58, 0xa9, 0x3f, 0xe1, 0xd2, 0x07, 0xfa, 0xe4, 0xed, 0x2f, 0x6d};
+static uint8_t P4[] = {0xcc, 0x38, 0xbc, 0xcd, 0x6b, 0xc5, 0x36, 0xad, 0x91, 0x9b, 0x13, 0x95, 0xf5, 0xd6, 0x38, 0x01, 0xf9, 0x9f, 0x80, 0x68, 0xd6, 0x5c, 0xa5, 0xac, 0x63, 0x87, 0x2d, 0xaf, 0x16, 0xb9, 0x39, 0x01};
+static uint8_t A4[] = {0x02, 0x1f, 0xaf, 0xd2, 0x38, 0x46, 0x39, 0x73, 0xff, 0xe8, 0x02, 0x56, 0xe5, 0xb1, 0xc6, 0xb1};
+#define A4_len sizeof(A4)
+static uint8_t C4[] = {0xdf, 0xce, 0x4e, 0x9c, 0xd2, 0x91, 0x10, 0x3d, 0x7f, 0xe4, 0xe6, 0x33, 0x51, 0xd9, 0xe7, 0x9d, 0x3d, 0xfd, 0x39, 0x1e, 0x32, 0x67, 0x10, 0x46, 0x58, 0x21, 0x2d, 0xa9, 0x65, 0x21, 0xb7, 0xdb};
+static uint8_t T4[] = {0x54, 0x24, 0x65, 0xef, 0x59, 0x93, 0x16, 0xf7, 0x3a, 0x7a, 0x56, 0x05, 0x09, 0xa2, 0xd9, 0xf2};
+
+///////
+// http://csrc.nist.gov/groups/STM/cavp/gcmtestvectors.zip gcmEncryptExtIV128.rsp
+// [Keylen = 128]
+// [IVlen = 96]
+// [PTlen = 256]
+// [AADlen = 128]
+// [Taglen = 128]
+// Count = 0
+// K = 298efa1ccf29cf62ae6824bfc19557fc
+// IV = 6f58a93fe1d207fae4ed2f6d
+// P = cc38bccd6bc536ad919b1395f5d63801f99f8068d65ca5ac63872daf16b93901
+// AAD = 021fafd238463973ffe80256e5b1c6b1
+// C = dfce4e9cd291103d7fe4e63351d9e79d3dfd391e3267104658212da96521b7db
+// T = 542465ef599316f73a7a560509a2d9f2
+///////
+static uint8_t K5[] = {0x29, 0x8e, 0xfa, 0x1c, 0xcf, 0x29, 0xcf, 0x62, 0xae, 0x68, 0x24, 0xbf, 0xc1, 0x95, 0x57, 0xfc};
+static uint8_t IV5[] = {0x6f, 0x58, 0xa9, 0x3f, 0xe1, 0xd2, 0x07, 0xfa, 0xe4, 0xed, 0x2f, 0x6d};
+static uint8_t P5[] = {0xcc, 0x38, 0xbc, 0xcd, 0x6b, 0xc5, 0x36, 0xad, 0x91, 0x9b, 0x13, 0x95, 0xf5, 0xd6, 0x38, 0x01, 0xf9, 0x9f, 0x80, 0x68, 0xd6, 0x5c, 0xa5, 0xac, 0x63, 0x87, 0x2d, 0xaf, 0x16, 0xb9, 0x39, 0x01};
+static uint8_t A5[] = {0x02, 0x1f, 0xaf, 0xd2, 0x38, 0x46, 0x39, 0x73, 0xff, 0xe8, 0x02, 0x56, 0xe5, 0xb1, 0xc6, 0xb1};
+#define A5_len sizeof(A5)
+static uint8_t C5[] = {0xdf, 0xce, 0x4e, 0x9c, 0xd2, 0x91, 0x10, 0x3d, 0x7f, 0xe4, 0xe6, 0x33, 0x51, 0xd9, 0xe7, 0x9d, 0x3d, 0xfd, 0x39, 0x1e, 0x32, 0x67, 0x10, 0x46, 0x58, 0x21, 0x2d, 0xa9, 0x65, 0x21, 0xb7, 0xdb};
+static uint8_t T5[] = {0x54, 0x24, 0x65, 0xef, 0x59, 0x93, 0x16, 0xf7, 0x3a, 0x7a, 0x56, 0x05, 0x09, 0xa2, 0xd9, 0xf2};
+
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 2
+// K: 00000000000000000000000000000000
+// P: 00000000000000000000000000000000
+// IV: 000000000000000000000000
+// C: 0388dace60b6a392f328c2b971b2fe78
+// T: ab6e47d42cec13bdf53a67b21257bddf
+// H: 66e94bd4ef8a2c3b884cfa59ca342b2e
+///////
+static uint8_t K6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+static uint8_t P6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+static uint8_t IV6[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+static uint8_t A6[] = {0};
+#define A6_len 0
+static uint8_t C6[] = {0x03, 0x88, 0xda, 0xce, 0x60, 0xb6, 0xa3, 0x92, 0xf3, 0x28, 0xc2, 0xb9, 0x71, 0xb2, 0xfe, 0x78};
+static uint8_t T6[] = {0xab, 0x6e, 0x47, 0xd4, 0x2c, 0xec, 0x13, 0xbd, 0xf5, 0x3a, 0x67, 0xb2, 0x12, 0x57, 0xbd, 0xdf};
+
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 3
+// K: feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b391aafd255
+// IV: cafebabefacedbaddecaf888
+// H: b83b533708bf535d0aa6e52980d53b78
+// C: 42831ec2217774244b7221b784d0d49c
+// e3aa212f2c02a4e035c17e2329aca12e
+// 21d514b25466931c7d8f6a5aac84aa05
+// 1ba30b396a0aac973d58e091473f5985
+// T: 4d5c2af327cd64a62cf35abd2ba6fab4
+///////
+static uint8_t K7[] = {0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P7[] = {0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a
+ , 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72
+ , 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25
+ , 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55};
+static uint8_t IV7[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+static uint8_t A7[] = {0};
+#define A7_len 0
+static uint8_t C7[] = {0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c
+ , 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e
+ , 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05
+ , 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85};
+static uint8_t T7[] = {0x4d, 0x5c, 0x2a, 0xf3, 0x27, 0xcd, 0x64, 0xa6, 0x2c, 0xf3, 0x5a, 0xbd, 0x2b, 0xa6, 0xfa, 0xb4};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 4
+// K: feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: cafebabefacedbaddecaf888
+// H: b83b533708bf535d0aa6e52980d53b78
+// C: 42831ec2217774244b7221b784d0d49c
+// e3aa212f2c02a4e035c17e2329aca12e
+// 21d514b25466931c7d8f6a5aac84aa05
+// 1ba30b396a0aac973d58e091
+// T: 5bc94fbc3221a5db94fae95ae7121a47
+///////
+static uint8_t K8[] = {0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P8[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a
+ , 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72
+ , 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25
+ , 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39};
+static uint8_t A8[] = {0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef
+ , 0xab, 0xad, 0xda, 0xd2};
+#define A8_len sizeof(A8)
+static uint8_t IV8[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+static uint8_t C8[] = {0x42, 0x83, 0x1e, 0xc2, 0x21, 0x77, 0x74, 0x24, 0x4b, 0x72, 0x21, 0xb7, 0x84, 0xd0, 0xd4, 0x9c
+ , 0xe3, 0xaa, 0x21, 0x2f, 0x2c, 0x02, 0xa4, 0xe0, 0x35, 0xc1, 0x7e, 0x23, 0x29, 0xac, 0xa1, 0x2e
+ , 0x21, 0xd5, 0x14, 0xb2, 0x54, 0x66, 0x93, 0x1c, 0x7d, 0x8f, 0x6a, 0x5a, 0xac, 0x84, 0xaa, 0x05
+ , 0x1b, 0xa3, 0x0b, 0x39, 0x6a, 0x0a, 0xac, 0x97, 0x3d, 0x58, 0xe0, 0x91, 0x47, 0x3f, 0x59, 0x85};
+static uint8_t T8[] = {0x5b, 0xc9, 0x4f, 0xbc, 0x32, 0x21, 0xa5, 0xdb, 0x94, 0xfa, 0xe9, 0x5a, 0xe7, 0x12, 0x1a, 0x47};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 14
+// K: 00000000000000000000000000000000
+// 00000000000000000000000000000000
+// P: 00000000000000000000000000000000
+// A:
+// IV: 000000000000000000000000
+// H: dc95c078a2408989ad48a21492842087
+// C: cea7403d4d606b6e074ec5d3baf39d18
+// T: d0d1c8a799996bf0265b98b5d48ab919
+///////
+static uint8_t K9[] = {
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
+static uint8_t P9[] = {
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+};
+static uint8_t A9[] = {0};
+#define A9_len 0
+static uint8_t IV9[] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
+static uint8_t C9[] = {
+ 0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e, 0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18
+};
+static uint8_t T9[] = {0xd0, 0xd1, 0xc8, 0xa7, 0x99, 0x99, 0x6b, 0xf0, 0x26, 0x5b, 0x98, 0xb5, 0xd4, 0x8a, 0xb9, 0x19};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 15
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b391aafd255
+// A:
+// IV: cafebabefacedbaddecaf888
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: 522dc1f099567d07f47f37a32a84427d
+// 643a8cdcbfe5c0c97598a2bd2555d1aa
+// 8cb08e48590dbb3da7b08b1056828838
+// c5f61e6393ba7a0abcc9f662898015ad
+// T: b094dac5d93471bdec1a502270e3cc6c
+///////
+static uint8_t K10[] = {
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P10[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39, 0x1a, 0xaf, 0xd2, 0x55
+};
+static uint8_t A10[] = {0};
+#define A10_len 0
+static uint8_t IV10[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+static uint8_t C10[] = {
+ 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+ 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+ 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+ 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, 0xbc, 0xc9, 0xf6, 0x62, 0x89, 0x80, 0x15, 0xad
+};
+static uint8_t T10[] = {
+ 0xb0, 0x94, 0xda, 0xc5, 0xd9, 0x34, 0x71, 0xbd, 0xec, 0x1a, 0x50, 0x22, 0x70, 0xe3, 0xcc, 0x6c};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 16
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: cafebabefacedbaddecaf888
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: 522dc1f099567d07f47f37a32a84427d
+// 643a8cdcbfe5c0c97598a2bd2555d1aa
+// 8cb08e48590dbb3da7b08b1056828838
+// c5f61e6393ba7a0abcc9f662
+// T: 76fc6ece0f4e1768cddf8853bb2d551b
+///////
+static uint8_t K11[] = {
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+ 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+static uint8_t P11[] = {
+ 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+ 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+ 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+ 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39
+};
+static uint8_t A11[] = {
+ 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+ 0xab, 0xad, 0xda, 0xd2};
+#define A11_len sizeof(A11)
+static uint8_t IV11[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad, 0xde, 0xca, 0xf8, 0x88};
+static uint8_t C11[] = {
+ 0x52, 0x2d, 0xc1, 0xf0, 0x99, 0x56, 0x7d, 0x07, 0xf4, 0x7f, 0x37, 0xa3, 0x2a, 0x84, 0x42, 0x7d,
+ 0x64, 0x3a, 0x8c, 0xdc, 0xbf, 0xe5, 0xc0, 0xc9, 0x75, 0x98, 0xa2, 0xbd, 0x25, 0x55, 0xd1, 0xaa,
+ 0x8c, 0xb0, 0x8e, 0x48, 0x59, 0x0d, 0xbb, 0x3d, 0xa7, 0xb0, 0x8b, 0x10, 0x56, 0x82, 0x88, 0x38,
+ 0xc5, 0xf6, 0x1e, 0x63, 0x93, 0xba, 0x7a, 0x0a, 0xbc, 0xc9, 0xf6, 0x62
+};
+static uint8_t T11[] = {0x76, 0xfc, 0x6e, 0xce, 0x0f, 0x4e, 0x17, 0x68, 0xcd, 0xdf, 0x88, 0x53, 0xbb, 0x2d, 0x55, 0x1b};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 17 -- Not supported IV length less than 12 bytes
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: cafebabefacedbad
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: c3762df1ca787d32ae47c13bf19844cb
+// af1ae14d0b976afac52ff7d79bba9de0
+// feb582d33934a4f0954cc2363bc73f78
+// 62ac430e64abe499f47c9b1f
+// T: 3a337dbf46a792c45e454913fe2ea8f2
+///////
+//static uint8_t K12[] = {
+// 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08,
+// 0xfe, 0xff, 0xe9, 0x92, 0x86, 0x65, 0x73, 0x1c, 0x6d, 0x6a, 0x8f, 0x94, 0x67, 0x30, 0x83, 0x08};
+//static uint8_t P12[] = {
+// 0xd9, 0x31, 0x32, 0x25, 0xf8, 0x84, 0x06, 0xe5, 0xa5, 0x59, 0x09, 0xc5, 0xaf, 0xf5, 0x26, 0x9a,
+// 0x86, 0xa7, 0xa9, 0x53, 0x15, 0x34, 0xf7, 0xda, 0x2e, 0x4c, 0x30, 0x3d, 0x8a, 0x31, 0x8a, 0x72,
+// 0x1c, 0x3c, 0x0c, 0x95, 0x95, 0x68, 0x09, 0x53, 0x2f, 0xcf, 0x0e, 0x24, 0x49, 0xa6, 0xb5, 0x25,
+// 0xb1, 0x6a, 0xed, 0xf5, 0xaa, 0x0d, 0xe6, 0x57, 0xba, 0x63, 0x7b, 0x39
+//};
+//static uint8_t A12[] = {
+// 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef, 0xfe, 0xed, 0xfa, 0xce, 0xde, 0xad, 0xbe, 0xef,
+// 0xab, 0xad, 0xda, 0xd2};
+//static uint8_t IV12[] = {0xca, 0xfe, 0xba, 0xbe, 0xfa, 0xce, 0xdb, 0xad};
+//static uint8_t H12[] = {
+// 0xac, 0xbe, 0xf2, 0x05, 0x79, 0xb4, 0xb8, 0xeb, 0xce, 0x88, 0x9b, 0xac, 0x87, 0x32, 0xda, 0xd7};
+//static uint8_t C12[] = {
+// 0xc3, 0x76, 0x2d, 0xf1, 0xca, 0x78, 0x7d, 0x32, 0xae, 0x47, 0xc1, 0x3b, 0xf1, 0x98, 0x44, 0xcb,
+// 0xaf, 0x1a, 0xe1, 0x4d, 0x0b, 0x97, 0x6a, 0xfa, 0xc5, 0x2f, 0xf7, 0xd7, 0x9b, 0xba, 0x9d, 0xe0,
+// 0xfe, 0xb5, 0x82, 0xd3, 0x39, 0x34, 0xa4, 0xf0, 0x95, 0x4c, 0xc2, 0x36, 0x3b, 0xc7, 0x3f, 0x78,
+// 0x62, 0xac, 0x43, 0x0e, 0x64, 0xab, 0xe4, 0x99, 0xf4, 0x7c, 0x9b, 0x1f
+//};
+//static uint8_t T12[] = {
+// 0x3a, 0x33, 0x7d, 0xbf, 0x46, 0xa7, 0x92, 0xc4, 0x5e, 0x45, 0x49, 0x13, 0xfe, 0x2e, 0xa8, 0xf2};
+
+///////
+// http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/gcm/gcm-revised-spec.pdf
+// Test Case 18 -- Not supported IV length greater than 12 bytes
+// K: feffe9928665731c6d6a8f9467308308
+// feffe9928665731c6d6a8f9467308308
+// P: d9313225f88406e5a55909c5aff5269a
+// 86a7a9531534f7da2e4c303d8a318a72
+// 1c3c0c95956809532fcf0e2449a6b525
+// b16aedf5aa0de657ba637b39
+// A: feedfacedeadbeeffeedfacedeadbeef
+// abaddad2
+// IV: 9313225df88406e555909c5aff5269aa
+// 6a7a9538534f7da1e4c303d2a318a728
+// c3c0c95156809539fcf0e2429a6b5254
+// 16aedbf5a0de6a57a637b39b
+// H: acbef20579b4b8ebce889bac8732dad7
+// C: 5a8def2f0c9e53f1f75d7853659e2a20
+// eeb2b22aafde6419a058ab4f6f746bf4
+// 0fc0c3b780f244452da3ebf1c5d82cde
+// a2418997200ef82e44ae7e3f
+// T: a44a8266ee1c8eb0c8b5d4cf5ae9f19a
+///////
+
+
+#define vector(N) {K##N, (KBITS(K##N)), IV##N, sizeof(IV##N), A##N, A##N##_len, P##N, sizeof(P##N), C##N, T##N, sizeof(T##N)}
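+// For reference, vector(1) expands to:
+//   {K1, (KBITS(K1)), IV1, sizeof(IV1), A1, A1_len, P1, sizeof(P1), C1, T1, sizeof(T1)}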
+
+gcm_vector const gcm_vectors[] = {
+ //field order {K, Klen, IV, IVlen, A, Alen, P, Plen, C, T, Tlen};
+ // original vector does not have a valid sub hash key
+ vector(1),
+ vector(2),
+ vector(3),
+ vector(4),
+ vector(5),
+ vector(6),
+ vector(7),
+ vector(8),
+ vector(9),
+ vector(10),
+ vector(11),
+ /* vector(12), -- IVs of less than 12 bytes are not supported */
+};
+
+#endif /* AES_GCM_VECTORS_H_ */
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm
new file mode 100644
index 000000000..9f7a13077
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_128.asm
@@ -0,0 +1,320 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Routine to do AES key expansion
+
+%include "reg_sizes.asm"
+
+%macro key_expansion_128_sse 0
+ ;; Assumes the xmm3 includes all zeros at this point.
+ pshufd xmm2, xmm2, 11111111b
+ shufps xmm3, xmm1, 00010000b
+ pxor xmm1, xmm3
+ shufps xmm3, xmm1, 10001100b
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+%endmacro
+
+%macro key_expansion_128_avx 0
+ ;; Assumes the xmm3 includes all zeros at this point.
+ vpshufd xmm2, xmm2, 11111111b
+ vshufps xmm3, xmm3, xmm1, 00010000b
+ vpxor xmm1, xmm1, xmm3
+ vshufps xmm3, xmm3, xmm1, 10001100b
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm2
+%endmacro
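+
+; Note (sketch of what the two macros above compute): xmm1 holds the previous
+; round key {w0,w1,w2,w3} and, after the pshufd broadcast, every lane of xmm2
+; holds t = SubWord(RotWord(w3)) xor Rcon from aeskeygenassist. The two
+; shufps/pxor pairs then build the running XORs, leaving the new round key
+;   { w0^t, w0^w1^t, w0^w1^w2^t, w0^w1^w2^w3^t }
+; in xmm1.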
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define KEY rdi
+%define EXP_ENC_KEYS rsi
+%define EXP_DEC_KEYS rdx
+%else
+%define KEY rcx
+%define EXP_ENC_KEYS rdx
+%define EXP_DEC_KEYS r8
+%endif
+
+
+; void aes_keyexp_128(UINT8 *key,
+; UINT8 *enc_exp_keys,
+; UINT8 *dec_exp_keys);
+;
+; arg 1: rcx: pointer to key
+; arg 2: rdx: pointer to expanded key array for encrypt
+; arg 3: r8: pointer to expanded key array for decrypt
+;
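+; A minimal usage sketch from C (illustrative only; the plain aes_keyexp_128()
+; name assumes the ISA-L multibinary dispatcher is linked in, otherwise the
+; _sse/_avx variants below are called directly). AES-128 has 10 rounds, so each
+; expanded-key buffer must hold 11 round keys, i.e. 11*16 = 176 bytes:
+;
+;     uint8_t key[16], enc_keys[11 * 16], dec_keys[11 * 16];
+;     aes_keyexp_128(key, enc_keys, dec_keys);
+;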
+global aes_keyexp_128_sse:function
+aes_keyexp_128_sse:
+ movdqu xmm1, [KEY] ; loading the AES key
+ movdqu [EXP_ENC_KEYS + 16*0], xmm1
+ movdqu [EXP_DEC_KEYS + 16*10], xmm1 ; Storing key in memory
+ pxor xmm3, xmm3
+
+ aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*1], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*9], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*2], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*3], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*7], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*4], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*5], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*5], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*6], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*7], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*3], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*8], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*9], xmm1
+ aesimc xmm4, xmm1
+ movdqu [EXP_DEC_KEYS + 16*1], xmm4
+
+ aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*10], xmm1
+ movdqu [EXP_DEC_KEYS + 16*0], xmm1
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+global aes_keyexp_128_avx:function
+aes_keyexp_128_avx:
+ vmovdqu xmm1, [KEY] ; loading the AES key
+ vmovdqu [EXP_ENC_KEYS + 16*0], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*10], xmm1 ; Storing key in memory
+ vpxor xmm3, xmm3, xmm3
+
+ vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*1], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*9], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*2], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*3], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*7], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*4], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*5], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*5], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*6], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*7], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*3], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*8], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*9], xmm1
+ vaesimc xmm4, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*1], xmm4
+
+ vaeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*10], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*0], xmm1
+
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; void aes_keyexp_128_enc_sse(UINT8 *key,
+; UINT8 *enc_exp_keys);
+;
+; arg 1: rcx: pointer to key
+; arg 2: rdx: pointer to expanded key array for encrypt
+;
+global aes_keyexp_128_enc_sse:function
+aes_keyexp_128_enc_sse:
+ movdqu xmm1, [KEY] ; loading the AES key
+ movdqu [EXP_ENC_KEYS + 16*0], xmm1
+ pxor xmm3, xmm3
+
+ aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*1], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*2], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*3], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*4], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*5], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*6], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*7], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*8], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*9], xmm1
+
+ aeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_sse
+ movdqu [EXP_ENC_KEYS + 16*10], xmm1
+
+ ret
+
+global aes_keyexp_128_enc_avx:function
+aes_keyexp_128_enc_avx:
+ vmovdqu xmm1, [KEY] ; loading the AES key
+ vmovdqu [EXP_ENC_KEYS + 16*0], xmm1
+ vpxor xmm3, xmm3, xmm3
+
+ vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 1
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*1], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 2
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*2], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 3
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*3], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 4
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*4], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 5
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*5], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 6
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*6], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x40 ; Generating round key 7
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*7], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x80 ; Generating round key 8
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*8], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x1b ; Generating round key 9
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*9], xmm1
+
+ vaeskeygenassist xmm2, xmm1, 0x36 ; Generating round key 10
+ key_expansion_128_avx
+ vmovdqu [EXP_ENC_KEYS + 16*10], xmm1
+
+ ret
+
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm
new file mode 100644
index 000000000..33fcef83e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_192.asm
@@ -0,0 +1,268 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define KEY rdi
+%define EXP_ENC_KEYS rsi
+%define EXP_DEC_KEYS rdx
+%else
+%define KEY rcx
+%define EXP_ENC_KEYS rdx
+%define EXP_DEC_KEYS r8
+%endif
+
+
+
+
+%macro key_expansion_1_192_sse 1
+ ;; Assumes the xmm3 includes all zeros at this point.
+ pshufd xmm2, xmm2, 11111111b
+ shufps xmm3, xmm1, 00010000b
+ pxor xmm1, xmm3
+ shufps xmm3, xmm1, 10001100b
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+ movdqu [EXP_ENC_KEYS+%1], xmm1
+%endmacro
+
+; Calculate w10 and w11 using calculated w9 and known w4-w5
+%macro key_expansion_2_192_sse 1
+ movdqu xmm5, xmm4
+ pslldq xmm5, 4
+ shufps xmm6, xmm1, 11110000b
+ pxor xmm6, xmm5
+ pxor xmm4, xmm6
+ pshufd xmm7, xmm4, 00001110b
+ movdqu [EXP_ENC_KEYS+%1], xmm7
+%endmacro
+
+%macro key_dec_192_sse 1
+ movdqu xmm0, [EXP_ENC_KEYS + 16 * %1]
+ aesimc xmm1, xmm0
+ movdqu [EXP_DEC_KEYS + 16 * (12 - %1)], xmm1
+%endmacro
+
+
+
+
+
+%macro key_expansion_1_192_avx 1
+ ;; Assumes the xmm3 includes all zeros at this point.
+ vpshufd xmm2, xmm2, 11111111b
+ vshufps xmm3, xmm3, xmm1, 00010000b
+ vpxor xmm1, xmm1, xmm3
+ vshufps xmm3, xmm3, xmm1, 10001100b
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vmovdqu [EXP_ENC_KEYS+%1], xmm1
+%endmacro
+
+; Calculate w10 and w11 using calculated w9 and known w4-w5
+%macro key_expansion_2_192_avx 1
+ vmovdqa xmm5, xmm4
+ vpslldq xmm5, xmm5, 4
+ vshufps xmm6, xmm6, xmm1, 11110000b
+ vpxor xmm6, xmm6, xmm5
+ vpxor xmm4, xmm4, xmm6
+ vpshufd xmm7, xmm4, 00001110b
+ vmovdqu [EXP_ENC_KEYS+%1], xmm7
+%endmacro
+
+%macro key_dec_192_avx 1
+ vmovdqu xmm0, [EXP_ENC_KEYS + 16 * %1]
+ vaesimc xmm1, xmm0
+ vmovdqu [EXP_DEC_KEYS + 16 * (12 - %1)], xmm1
+%endmacro
+
+
+
+
+; void aes_keyexp_192(UINT8 *key,
+; UINT8 *enc_exp_keys,
+; UINT8 *dec_exp_keys);
+;
+; arg 1: rcx: pointer to key
+; arg 2: rdx: pointer to expanded key array for encrypt
+; arg 3: r8: pointer to expanded key array for decrypt
+;
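+; A minimal usage sketch from C (illustrative only; the plain aes_keyexp_192()
+; name assumes the ISA-L multibinary dispatcher is linked in). AES-192 has 12
+; rounds, so each expanded-key buffer must hold 13 round keys, i.e. 13*16 = 208
+; bytes:
+;
+;     uint8_t key[24], enc_keys[13 * 16], dec_keys[13 * 16];
+;     aes_keyexp_192(key, enc_keys, dec_keys);
+;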
+global aes_keyexp_192_sse:function
+aes_keyexp_192_sse:
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ sub rsp, 16*2 + 8
+ movdqu [rsp + 0*16], xmm6
+ movdqu [rsp + 1*16], xmm7
+%endif
+
+ movq xmm7, [KEY + 16] ; loading the AES key, 64 bits
+ movq [EXP_ENC_KEYS + 16], xmm7 ; Storing key in memory where all key expansion keys are kept
+ pshufd xmm4, xmm7, 01001111b
+ movdqu xmm1, [KEY] ; loading the AES key, 128 bits
+ movdqu [EXP_ENC_KEYS], xmm1 ; Storing key in memory where all key expansion keys are kept
+ movdqu [EXP_DEC_KEYS + 16*0], xmm1
+ movdqu [EXP_DEC_KEYS + 16*12], xmm1
+
+ pxor xmm3, xmm3 ; Set xmm3 to be all zeros. Required for the key_expansion.
+ pxor xmm6, xmm6 ; Set xmm6 to be all zeros. Required for the key_expansion.
+
+ aeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2
+ key_expansion_1_192_sse 24
+ key_expansion_2_192_sse 40
+
+ aeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4
+ key_expansion_1_192_sse 48
+ key_expansion_2_192_sse 64
+
+ aeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5
+ key_expansion_1_192_sse 72
+ key_expansion_2_192_sse 88
+
+ aeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7
+ key_expansion_1_192_sse 96
+ key_expansion_2_192_sse 112
+
+ aeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8
+ key_expansion_1_192_sse 120
+ key_expansion_2_192_sse 136
+
+ aeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10
+ key_expansion_1_192_sse 144
+ key_expansion_2_192_sse 160
+
+ aeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11
+ key_expansion_1_192_sse 168
+ key_expansion_2_192_sse 184
+
+ aeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12
+ key_expansion_1_192_sse 192
+
+;;; we have already saved the 12th key, which is pure input on the
+;;; ENC key path
+ movdqu xmm0, [EXP_ENC_KEYS + 16 * 12]
+ movdqu [EXP_DEC_KEYS + 16*0], xmm0
+;;; generate remaining decrypt keys
+ key_dec_192_sse 1
+ key_dec_192_sse 2
+ key_dec_192_sse 3
+ key_dec_192_sse 4
+ key_dec_192_sse 5
+ key_dec_192_sse 6
+ key_dec_192_sse 7
+ key_dec_192_sse 8
+ key_dec_192_sse 9
+ key_dec_192_sse 10
+ key_dec_192_sse 11
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ movdqu xmm6, [rsp + 0*16]
+ movdqu xmm7, [rsp + 1*16]
+ add rsp, 16*2 + 8
+%endif
+
+ ret
+
+
+
+global aes_keyexp_192_avx:function
+aes_keyexp_192_avx:
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ sub rsp, 16*2 + 8
+ vmovdqu [rsp + 0*16], xmm6
+ vmovdqu [rsp + 1*16], xmm7
+%endif
+
+ vmovq xmm7, [KEY + 16] ; loading the AES key, 64 bits
+ vmovq [EXP_ENC_KEYS + 16], xmm7 ; Storing key in memory where all key expansion keys are kept
+ vpshufd xmm4, xmm7, 01001111b
+ vmovdqu xmm1, [KEY] ; loading the AES key, 128 bits
+ vmovdqu [EXP_ENC_KEYS], xmm1 ; Storing key in memory where all key expansion keys are kept
+ vmovdqu [EXP_DEC_KEYS + 16*0], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*12], xmm1
+
+ vpxor xmm3, xmm3, xmm3
+ vpxor xmm6, xmm6, xmm6
+
+ vaeskeygenassist xmm2, xmm4, 0x1 ; Complete round key 1 and generate round key 2
+ key_expansion_1_192_avx 24
+ key_expansion_2_192_avx 40
+
+ vaeskeygenassist xmm2, xmm4, 0x2 ; Generate round key 3 and part of round key 4
+ key_expansion_1_192_avx 48
+ key_expansion_2_192_avx 64
+
+ vaeskeygenassist xmm2, xmm4, 0x4 ; Complete round key 4 and generate round key 5
+ key_expansion_1_192_avx 72
+ key_expansion_2_192_avx 88
+
+ vaeskeygenassist xmm2, xmm4, 0x8 ; Generate round key 6 and part of round key 7
+ key_expansion_1_192_avx 96
+ key_expansion_2_192_avx 112
+
+ vaeskeygenassist xmm2, xmm4, 0x10 ; Complete round key 7 and generate round key 8
+ key_expansion_1_192_avx 120
+ key_expansion_2_192_avx 136
+
+ vaeskeygenassist xmm2, xmm4, 0x20 ; Generate round key 9 and part of round key 10
+ key_expansion_1_192_avx 144
+ key_expansion_2_192_avx 160
+
+ vaeskeygenassist xmm2, xmm4, 0x40 ; Complete round key 10 and generate round key 11
+ key_expansion_1_192_avx 168
+ key_expansion_2_192_avx 184
+
+ vaeskeygenassist xmm2, xmm4, 0x80 ; Generate round key 12
+ key_expansion_1_192_avx 192
+
+;;; we have already saved the 12th key, which is pure input on the
+;;; ENC key path
+ vmovdqu xmm0, [EXP_ENC_KEYS + 16 * 12]
+ vmovdqu [EXP_DEC_KEYS + 16*0], xmm0
+;;; generate remaining decrypt keys
+ key_dec_192_avx 1
+ key_dec_192_avx 2
+ key_dec_192_avx 3
+ key_dec_192_avx 4
+ key_dec_192_avx 5
+ key_dec_192_avx 6
+ key_dec_192_avx 7
+ key_dec_192_avx 8
+ key_dec_192_avx 9
+ key_dec_192_avx 10
+ key_dec_192_avx 11
+
+%ifnidn __OUTPUT_FORMAT__, elf64
+ vmovdqu xmm6, [rsp + 0*16]
+ vmovdqu xmm7, [rsp + 1*16]
+ add rsp, 16*2 + 8
+%endif
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm
new file mode 100644
index 000000000..ff08cd0f9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_256.asm
@@ -0,0 +1,280 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+; Routine to do AES key expansion
+
+%include "reg_sizes.asm"
+
+; Uses the f() function of the aeskeygenassist result
+%macro key_expansion_256_sse 0
+	;; Assumes xmm3 contains all zeros at this point.
+ pshufd xmm2, xmm2, 11111111b
+ shufps xmm3, xmm1, 00010000b
+ pxor xmm1, xmm3
+ shufps xmm3, xmm1, 10001100b
+ pxor xmm1, xmm3
+ pxor xmm1, xmm2
+%endmacro
+
+; Uses the SubWord function of the aeskeygenassist result
+%macro key_expansion_256_sse_2 0
+	;; Assumes xmm3 contains all zeros at this point.
+ pshufd xmm2, xmm2, 10101010b
+ shufps xmm3, xmm4, 00010000b
+ pxor xmm4, xmm3
+ shufps xmm3, xmm4, 10001100b
+ pxor xmm4, xmm3
+ pxor xmm4, xmm2
+%endmacro
+
+; Uses the f() function of the aeskeygenassist result
+%macro key_expansion_256_avx 0
+	;; Assumes xmm3 contains all zeros at this point.
+ vpshufd xmm2, xmm2, 11111111b
+ vshufps xmm3, xmm3, xmm1, 00010000b
+ vpxor xmm1, xmm1, xmm3
+ vshufps xmm3, xmm3, xmm1, 10001100b
+ vpxor xmm1, xmm1, xmm3
+ vpxor xmm1, xmm1, xmm2
+%endmacro
+
+; Uses the SubWord function of the aeskeygenassist result
+%macro key_expansion_256_avx_2 0
+	;; Assumes xmm3 contains all zeros at this point.
+ vpshufd xmm2, xmm2, 10101010b
+ vshufps xmm3, xmm3, xmm4, 00010000b
+ vpxor xmm4, xmm4, xmm3
+ vshufps xmm3, xmm3, xmm4, 10001100b
+ vpxor xmm4, xmm4, xmm3
+ vpxor xmm4, xmm4, xmm2
+%endmacro
+
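+; Reading aid only: in FIPS-197 terms the macros above implement the AES-256
+; key schedule recurrence (words w[0..7] come straight from the key):
+;
+;	for i = 8 .. 59:
+;		temp = w[i-1]
+;		if (i mod 8) == 0:      temp = SubWord(RotWord(temp)) xor Rcon[i/8]
+;		else if (i mod 8) == 4: temp = SubWord(temp)
+;		w[i] = w[i-8] xor temp
+;
+; aeskeygenassist returns both the RotWord+SubWord ("f()") result and the
+; plain SubWord result; the two macro variants above select one or the other
+; with the pshufd/vpshufd immediate.
+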
+%ifidn __OUTPUT_FORMAT__, elf64
+%define KEY rdi
+%define EXP_ENC_KEYS rsi
+%define EXP_DEC_KEYS rdx
+%else
+%define KEY rcx
+%define EXP_ENC_KEYS rdx
+%define EXP_DEC_KEYS r8
+%endif
+
+; void aes_keyexp_256(UINT8 *key,
+; UINT8 *enc_exp_keys,
+; UINT8 *dec_exp_keys);
+;
+; arg 1: rcx: pointer to key
+; arg 2: rdx: pointer to expanded key array for encrypt
+; arg 3: r8: pointer to expanded key array for decrypt
+;
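+; Caller-side sketch (illustrative only): AES-256 runs 14 rounds, so each
+; output buffer receives 15 round keys of 16 bytes (16*15 = 240 bytes).
+;
+;	uint8_t key[32], enc_keys[16 * 15], dec_keys[16 * 15];
+;	aes_keyexp_256(key, enc_keys, dec_keys);
+;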
+global aes_keyexp_256_sse:function
+aes_keyexp_256_sse:
+ movdqu xmm1, [KEY] ; loading the AES key
+ movdqu [EXP_ENC_KEYS + 16*0], xmm1
+ movdqu [EXP_DEC_KEYS + 16*14], xmm1 ; Storing key in memory
+
+ movdqu xmm4, [KEY+16] ; loading the AES key
+ movdqu [EXP_ENC_KEYS + 16*1], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*13], xmm0 ; Storing key in memory
+
+ pxor xmm3, xmm3 ; Required for the key_expansion.
+
+ aeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*2], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*12], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*3], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*11], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*4], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*10], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*5], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*9], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*6], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*7], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*7], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*8], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*9], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*5], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*10], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*11], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*3], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*12], xmm1
+ aesimc xmm5, xmm1
+ movdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ aeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13
+ key_expansion_256_sse_2
+ movdqu [EXP_ENC_KEYS + 16*13], xmm4
+ aesimc xmm0, xmm4
+ movdqu [EXP_DEC_KEYS + 16*1], xmm0
+
+ aeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14
+ key_expansion_256_sse
+ movdqu [EXP_ENC_KEYS + 16*14], xmm1
+ movdqu [EXP_DEC_KEYS + 16*0], xmm1
+
+ ret
+
+
+global aes_keyexp_256_avx:function
+aes_keyexp_256_avx:
+ vmovdqu xmm1, [KEY] ; loading the AES key
+ vmovdqu [EXP_ENC_KEYS + 16*0], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*14], xmm1 ; Storing key in memory
+
+ vmovdqu xmm4, [KEY+16] ; loading the AES key
+ vmovdqu [EXP_ENC_KEYS + 16*1], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*13], xmm0 ; Storing key in memory
+
+ vpxor xmm3, xmm3, xmm3 ; Required for the key_expansion.
+
+ vaeskeygenassist xmm2, xmm4, 0x1 ; Generating round key 2
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*2], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*12], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x1 ; Generating round key 3
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*3], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*11], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x2 ; Generating round key 4
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*4], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*10], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x2 ; Generating round key 5
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*5], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*9], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x4 ; Generating round key 6
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*6], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*8], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x4 ; Generating round key 7
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*7], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*7], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x8 ; Generating round key 8
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*8], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*6], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x8 ; Generating round key 9
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*9], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*5], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x10 ; Generating round key 10
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*10], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*4], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x10 ; Generating round key 11
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*11], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*3], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x20 ; Generating round key 12
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*12], xmm1
+ vaesimc xmm5, xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*2], xmm5
+
+ vaeskeygenassist xmm2, xmm1, 0x20 ; Generating round key 13
+ key_expansion_256_avx_2
+ vmovdqu [EXP_ENC_KEYS + 16*13], xmm4
+ vaesimc xmm0, xmm4
+ vmovdqu [EXP_DEC_KEYS + 16*1], xmm0
+
+ vaeskeygenassist xmm2, xmm4, 0x40 ; Generating round key 14
+ key_expansion_256_avx
+ vmovdqu [EXP_ENC_KEYS + 16*14], xmm1
+ vmovdqu [EXP_DEC_KEYS + 16*0], xmm1
+
+ ret
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm
new file mode 100644
index 000000000..698ea4b64
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/keyexp_multibinary.asm
@@ -0,0 +1,74 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+
+extern aes_keyexp_128_sse
+extern aes_keyexp_128_avx
+extern aes_keyexp_128_enc_sse
+extern aes_keyexp_128_enc_avx
+
+extern aes_keyexp_192_sse
+extern aes_keyexp_192_avx
+
+extern aes_keyexp_256_sse
+extern aes_keyexp_256_avx
+
+%include "multibinary.asm"
+
+
+;;;;
+; instantiate aes_keyexp_128 interfaces
+;;;;
+mbin_interface aes_keyexp_128
+mbin_dispatch_init aes_keyexp_128, aes_keyexp_128_sse, aes_keyexp_128_avx, aes_keyexp_128_avx
+
+mbin_interface aes_keyexp_128_enc
+mbin_dispatch_init aes_keyexp_128_enc, aes_keyexp_128_enc_sse, aes_keyexp_128_enc_avx, aes_keyexp_128_enc_avx
+
+mbin_interface aes_keyexp_192
+mbin_dispatch_init aes_keyexp_192, aes_keyexp_192_sse, aes_keyexp_192_avx, aes_keyexp_192_avx
+
+mbin_interface aes_keyexp_256
+mbin_dispatch_init aes_keyexp_256, aes_keyexp_256_sse, aes_keyexp_256_avx, aes_keyexp_256_avx
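+
+; Rough description of the dispatch above (not new code): mbin_interface
+; creates the public symbol backed by a function pointer, and
+; mbin_dispatch_init points it at the _sse or _avx variant on first use,
+; according to the CPU features detected at runtime. Callers just use the
+; generic name, e.g. aes_keyexp_256(key, enc_keys, dec_keys).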
+
+section .text
+;;; func core, ver, snum
+slversion aes_keyexp_128, 00, 01, 02a1
+slversion aes_keyexp_192, 00, 01, 02a2
+slversion aes_keyexp_256, 00, 01, 02a3
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h b/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h
new file mode 100644
index 000000000..db71ebb80
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/ossl_helper.h
@@ -0,0 +1,300 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef AES_OSSL_HELPER_H_
+#define AES_OSSL_HELPER_H_
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+#include <openssl/evp.h>
+
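+/* Note: these helpers use the pre-1.1.0 OpenSSL EVP API, in which an
+ * EVP_CIPHER_CTX may live on the stack and is set up with
+ * EVP_CIPHER_CTX_init(). With OpenSSL 1.1.0 and later the context is opaque
+ * and would instead be obtained with EVP_CIPHER_CTX_new()/EVP_CIPHER_CTX_free().
+ */
+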
+static inline
+ int openssl_aes_128_cbc_dec(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * cyphertext, uint8_t * plaintext)
+{
+ int outlen = 0, tmplen = 0;
+ EVP_CIPHER_CTX ctx;
+
+ EVP_CIPHER_CTX_init(&ctx);
+ if (!EVP_DecryptInit_ex(&ctx, EVP_aes_128_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_128_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(&ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_DecryptUpdate(&ctx, plaintext, &outlen, (uint8_t const *)cyphertext, len))
+ printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_128_cbc\n");
+ if (!EVP_DecryptFinal_ex(&ctx, &plaintext[outlen], &tmplen))
+ printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_128_cbc %x, %x, %x\n", len,
+ outlen, tmplen);
+
+ return tmplen;
+}
+
+static inline
+ int openssl_aes_128_cbc_enc(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * plaintext, uint8_t * cyphertext)
+{
+ int outlen, tmplen;
+ EVP_CIPHER_CTX ctx;
+
+ EVP_CIPHER_CTX_init(&ctx);
+ if (!EVP_EncryptInit_ex(&ctx, EVP_aes_128_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_128_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(&ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_EncryptUpdate
+ (&ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len))
+ printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_128_cbc\n");
+ if (!EVP_EncryptFinal_ex(&ctx, cyphertext + outlen, &tmplen))
+ printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_128_cbc\n");
+
+ return tmplen;
+}
+
+static inline
+ int openssl_aes_192_cbc_dec(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * cyphertext, uint8_t * plaintext)
+{
+ int outlen = 0, tmplen = 0;
+ EVP_CIPHER_CTX CTX;
+ EVP_CIPHER_CTX *ctx = &CTX;
+
+ EVP_CIPHER_CTX_init(ctx);
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_192_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_192_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_DecryptUpdate
+ (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+ printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_192_cbc\n");
+ if (!EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen))
+ printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_192_cbc \n");
+
+ return 0;
+}
+
+static inline
+ int openssl_aes_192_cbc_enc(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * plaintext, uint8_t * cyphertext)
+{
+ int outlen, tmplen;
+ EVP_CIPHER_CTX CTX;
+ EVP_CIPHER_CTX *ctx = &CTX;
+
+ EVP_CIPHER_CTX_init(ctx);
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_192_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_192_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_EncryptUpdate
+ (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len))
+ printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_192_cbc\n");
+ if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+ printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_192_cbc\n");
+
+ return 0;
+}
+
+static inline
+ int openssl_aes_256_cbc_dec(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * cyphertext, uint8_t * plaintext)
+{
+ int outlen = 0, tmplen = 0;
+ EVP_CIPHER_CTX CTX;
+ EVP_CIPHER_CTX *ctx = &CTX;
+
+ EVP_CIPHER_CTX_init(ctx);
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_256_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_DecryptUpdate
+ (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+ printf("\n ERROR!! EVP_DecryptUpdate - EVP_aes_256_cbc\n");
+ if (!EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen))
+ printf("\n ERROR!! EVP_DecryptFinal_ex - EVP_aes_256_cbc %x,%x\n", outlen,
+ tmplen);
+
+ return 0;
+}
+
+static inline
+ int openssl_aes_256_cbc_enc(uint8_t * key, uint8_t * iv,
+ int len, uint8_t * plaintext, uint8_t * cyphertext)
+{
+ int outlen, tmplen;
+ EVP_CIPHER_CTX CTX;
+ EVP_CIPHER_CTX *ctx = &CTX;
+
+ EVP_CIPHER_CTX_init(ctx);
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_cbc(), NULL, key, iv))
+ printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_256_cbc\n");
+ if (!EVP_CIPHER_CTX_set_padding(ctx, 0))
+ printf("\n ERROR!! EVP_CIPHER_CTX_set_padding - no padding\n");
+ if (!EVP_EncryptUpdate
+ (ctx, cyphertext, &outlen, (const unsigned char *)plaintext, len))
+ printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_256_cbc\n");
+ if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+ printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_256_cbc\n");
+
+ return 0;
+}
+
+static inline
+ int openssl_aes_gcm_dec(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+ int aad_len, uint8_t * tag, int tag_len, uint8_t * cyphertext,
+ int len, uint8_t * plaintext)
+{
+ int outlen = 0, tmplen = len, ret;
+ EVP_CIPHER_CTX CTX;
+ EVP_CIPHER_CTX *const ctx = &CTX;
+
+ EVP_CIPHER_CTX_init(ctx);
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL))
+ printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_128_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+ if (!EVP_DecryptInit_ex(ctx, NULL, NULL, key, iv))
+ printf("\n ERROR!! EVP_DecryptInit_ex - key init\n");
+ if (!EVP_DecryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+ printf("\n ERROR!! EVP_DecryptUpdate - aad data setup\n");
+ if (!EVP_DecryptUpdate
+ (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+ printf("\n ERROR!! EVP_DecryptUpdate - PT->CT\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+
+ ret = EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen);
+ if (0 < ret) {
+ tmplen += outlen;
+ } else {
+		// Authentication failed: mismatched key, AAD or tag
+ tmplen = -1;
+ }
+
+ return tmplen;
+}
+
+static inline
+ int openssl_aes_gcm_enc(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+ int aad_len, uint8_t * tag, int tag_len, uint8_t * plaintext,
+ int len, uint8_t * cyphertext)
+{
+ int outlen, tmplen;
+ EVP_CIPHER_CTX CTX;
+ EVP_CIPHER_CTX *const ctx = &CTX;
+
+ //printf("ivl:%x addl:%x tagl:%x ptl:%x\n", iv_len, aad_len, tag_len, len);
+ EVP_CIPHER_CTX_init(ctx);
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_gcm(), NULL, NULL, NULL))
+		printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_128_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+ if (!EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv))
+ printf("\n ERROR!! EVP_EncryptInit_ex - init\n");
+ if (!EVP_EncryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+ printf("\n ERROR!! EVP_EncryptUpdate - aad insert\n");
+ if (!EVP_EncryptUpdate(ctx, cyphertext, &outlen, (const uint8_t *)plaintext, len))
+		printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_128_gcm\n");
+ if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+		printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_128_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - tag \n");
+
+ return tmplen;
+}
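+
+/* Usage sketch (illustrative only): a round-trip with these GCM helpers.
+ * openssl_aes_gcm_dec() returns the plaintext length on success and -1 when
+ * tag authentication fails.
+ *
+ *	uint8_t key[16], iv[12], aad[16], tag[16], pt[64], ct[64], out[64];
+ *	openssl_aes_gcm_enc(key, iv, 12, aad, 16, tag, 16, pt, 64, ct);
+ *	int m = openssl_aes_gcm_dec(key, iv, 12, aad, 16, tag, 16, ct, 64, out);
+ */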
+
+static inline
+ int openssl_aes_256_gcm_dec(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+ int aad_len, uint8_t * tag, int tag_len, uint8_t * cyphertext,
+ int len, uint8_t * plaintext)
+{
+ int outlen = 0, tmplen = len, ret;
+ EVP_CIPHER_CTX CTX;
+ EVP_CIPHER_CTX *const ctx = &CTX;
+
+ EVP_CIPHER_CTX_init(ctx);
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_gcm(), NULL, NULL, NULL))
+		printf("\n ERROR!! EVP_DecryptInit_ex - EVP_aes_256_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+ if (!EVP_DecryptInit_ex(ctx, NULL, NULL, key, iv))
+ printf("\n ERROR!! EVP_DecryptInit_ex - key init\n");
+ if (!EVP_DecryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+ printf("\n ERROR!! EVP_DecryptUpdate - aad data setup\n");
+ if (!EVP_DecryptUpdate
+ (ctx, plaintext, &outlen, (const unsigned char *)cyphertext, len))
+ printf("\n ERROR!! EVP_DecryptUpdate - PT->CT\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - set tag\n");
+ ret = EVP_DecryptFinal_ex(ctx, plaintext + outlen, &tmplen);
+ if (0 < ret) {
+ tmplen += outlen;
+ } else {
+		// Authentication failed: mismatched key, AAD or tag
+ tmplen = -1;
+ }
+
+ return tmplen;
+}
+
+static inline
+ int openssl_aes_256_gcm_enc(uint8_t * key, uint8_t * iv, int iv_len, uint8_t * aad,
+ int aad_len, uint8_t * tag, int tag_len, uint8_t * plaintext,
+ int len, uint8_t * cyphertext)
+{
+ int outlen, tmplen;
+ EVP_CIPHER_CTX CTX;
+ EVP_CIPHER_CTX *const ctx = &CTX;
+
+ EVP_CIPHER_CTX_init(ctx);
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_gcm(), NULL, NULL, NULL))
+		printf("\n ERROR!! EVP_EncryptInit_ex - EVP_aes_256_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_SET_IVLEN, iv_len, NULL))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - IV length init\n");
+ if (!EVP_EncryptInit_ex(ctx, NULL, NULL, key, iv))
+ printf("\n ERROR!! EVP_EncryptInit_ex - init\n");
+ if (!EVP_EncryptUpdate(ctx, NULL, &outlen, aad, aad_len))
+ printf("\n ERROR!! EVP_EncryptUpdate - aad insert\n");
+ if (!EVP_EncryptUpdate(ctx, cyphertext, &outlen, (const uint8_t *)plaintext, len))
+		printf("\n ERROR!! EVP_EncryptUpdate - EVP_aes_256_gcm\n");
+ if (!EVP_EncryptFinal_ex(ctx, cyphertext + outlen, &tmplen))
+		printf("\n ERROR!! EVP_EncryptFinal_ex - EVP_aes_256_gcm\n");
+ if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_GCM_GET_TAG, tag_len, tag))
+ printf("\n ERROR!! EVP_CIPHER_CTX_ctrl - tag \n");
+
+ return tmplen;
+}
+
+#endif /* AES_OSSL_HELPER_H_ */
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c
new file mode 100644
index 000000000..bfde35570
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_ossl_perf.c
@@ -0,0 +1,141 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+ int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+static inline
+ int openssl_aes_128_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ unsigned char *ct, unsigned char *dt)
+{
+ int outlen, tmplen;
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, TEST_LEN))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct, *dt, *refdt;
+ unsigned char keyssl[32]; /* SSL takes both keys together */
+ struct perf start, stop;
+
+	/* Initialise our cipher context, which can use the same input vectors */
+ EVP_CIPHER_CTX ctx;
+ EVP_CIPHER_CTX_init(&ctx);
+
+ printf("aes_xts_128_dec_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+ refdt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt || NULL == refdt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ /* Set up key for the SSL engine */
+ for (i = 0; i < 16; i++) {
+ keyssl[i] = key1[i];
+ keyssl[i + 16] = key2[i];
+ }
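+	/* Note on key order: OpenSSL's EVP_aes_128_xts() takes the data key
+	 * followed by the tweak key in one 32-byte buffer, while the ISA-L
+	 * calls below pass the tweak key (key2) first and the data key
+	 * (key1) second. */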
+
+ /* Encrypt and compare decrypted output */
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ openssl_aes_128_xts_dec(&ctx, keyssl, tinit, ct, refdt);
+ if (memcmp(dt, refdt, TEST_LEN)) {
+ printf("ISA-L and OpenSSL results don't match\n");
+ return -1;
+ }
+
+ /* Time ISA-L decryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ perf_stop(&stop);
+ printf("aes_xts_128_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Time OpenSSL decryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ openssl_aes_128_xts_dec(&ctx, keyssl, tinit, ct, refdt);
+ perf_stop(&stop);
+ printf("aes_xts_128_openssl_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c
new file mode 100644
index 000000000..5f41f0645
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_dec_perf.c
@@ -0,0 +1,102 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+ int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct, *dt;
+
+ printf("aes_xts_128_dec_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+ struct perf start, stop;
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_128_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c
new file mode 100644
index 000000000..7c6a445e3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_ossl_perf.c
@@ -0,0 +1,142 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void xts128_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+static inline
+ int openssl_aes_128_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct, *refct;
+ struct perf start, stop;
+ unsigned char keyssl[32]; /* SSL takes both keys together */
+
+	/* Initialise our cipher context, which can use the same input vectors */
+ EVP_CIPHER_CTX ctx;
+ EVP_CIPHER_CTX_init(&ctx);
+
+ printf("aes_xts_128_enc_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ refct = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == refct) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ xts128_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (i = 0; i < 16; i++) {
+ keyssl[i] = key1[i];
+ keyssl[i + 16] = key2[i];
+ }
+
+ /* Encrypt and compare output */
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ openssl_aes_128_xts_enc(&ctx, keyssl, tinit, TEST_LEN, pt, refct);
+ if (memcmp(ct, refct, TEST_LEN)) {
+ printf("ISA-L and OpenSSL results don't match\n");
+ return -1;
+ }
+
+ /* Time ISA-L encryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ perf_stop(&stop);
+
+ printf("aes_xts_128_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Time OpenSSL encryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ openssl_aes_128_xts_enc(&ctx, keyssl, tinit, TEST_LEN, pt, refct);
+ perf_stop(&stop);
+
+ printf("aes_xts_128_openssl_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c
new file mode 100644
index 000000000..1fce1665c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_enc_perf.c
@@ -0,0 +1,100 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+ int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct;
+
+ printf("aes_xts_128_enc_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+
+ struct perf start, stop;
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_128_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c
new file mode 100644
index 000000000..3b294ef70
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_expanded_key_test.c
@@ -0,0 +1,116 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <aes_keyexp.h>
+#include "xts_128_vect.h"
+
+int main(void)
+{
+
+	// Temporary arrays for the calculated vectors
+ uint8_t *ct_test;
+ uint8_t *pt_test;
+ // Arrays for expanded keys, null_key is a dummy vector (decrypt key not
+ // needed for the tweak part of the decryption)
+ uint8_t expkey1_enc[16 * 11], expkey2_enc[16 * 11];
+ uint8_t expkey1_dec[16 * 11], null_key[16 * 11];
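+	// (16 bytes per round key x 11 round keys: AES-128 performs 10 rounds
+	// plus the initial AddRoundKey, hence the 16 * 11 sizing above)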
+
+ int i, j;
+
+ // --- Encryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vlist[i].ptlen);
+ if (ct_test == NULL) {
+ printf("Can't allocate ciphertext memory\n");
+ return -1;
+ }
+ // Pre-expand keys (will only use the encryption ones here)
+ aes_keyexp_128(vlist[i].key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_128(vlist[i].key2, expkey2_enc, null_key);
+
+ XTS_AES_128_enc_expanded_key(expkey2_enc, expkey1_enc, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (ct_test[j] != vlist[i].CTX[j]) {
+ // Vectors 1-10 and 15-19 are for the 128 bit code
+ printf("\nXTS_AES_128_enc: Vector %d: ",
+ i < 9 ? i + 1 : i + 6);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ }
+
+ // --- Decryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+		// Allocate space for the calculated plaintext
+ pt_test = malloc(vlist[i].ptlen);
+ if (pt_test == NULL) {
+ printf("Can't allocate plaintext memory\n");
+ return -1;
+ }
+ // Pre-expand keys for the decryption
+ aes_keyexp_128(vlist[i].key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_128(vlist[i].key2, expkey2_enc, null_key);
+
+ // Note, encryption key is re-used for the tweak decryption step
+ XTS_AES_128_dec_expanded_key(expkey2_enc, expkey1_dec, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+		// Carry out comparison of the calculated plaintext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (pt_test[j] != vlist[i].PTX[j]) {
+				printf("\nXTS_AES_128_dec: Vector %d: ",
+ i < 9 ? i + 1 : i + 6);
+ printf(" failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ }
+ printf("Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c
new file mode 100644
index 000000000..34498082d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand.c
@@ -0,0 +1,247 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_xts.h>
+#include <aes_keyexp.h>
+
+#define TEST_LEN (1024*1024)
+#define TEST_SIZE (4096)
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+ int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int t, n;
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct, *dt;
+
+ int align, size, min_size;
+ unsigned char *efence_pt;
+ unsigned char *efence_ct;
+ unsigned char *efence_dt;
+
+ unsigned char *origin_pt;
+ unsigned char *origin_ct;
+ unsigned char *origin_dt;
+
+ unsigned char key1_exp_enc[16 * 11], key1_exp_dec[16 * 11];
+ unsigned char key2_exp_tw[16 * 11];
+ int i;
+
+ printf("aes_xts_128 enc/dec rand test, %d sets of %d max: ", RANDOMS, TEST_LEN);
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+ if (memcmp(pt, dt, TEST_LEN)) {
+ printf("fail\n");
+ return -1;
+ }
+ putchar('.');
+
+ // Do tests with random data, keys and message size
+ for (t = 0; t < RANDOMS; t++) {
+ n = rand() % (TEST_LEN);
+ if (n < 17)
+ continue;
+
+ mk_rand_data(key1, key2, tinit, pt, n);
+ XTS_AES_128_enc(key2, key1, tinit, n, pt, ct);
+ XTS_AES_128_dec(key2, key1, tinit, n, ct, dt);
+
+ if (memcmp(pt, dt, n)) {
+ printf("fail rand %d, size %d\n", t, n);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ // Run tests at end of buffer for Electric Fence
+ align = 1;
+ min_size = 16;
+ for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ XTS_AES_128_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_128_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+ printf("efence: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ origin_pt = malloc(TEST_LEN);
+ origin_ct = malloc(TEST_LEN);
+ origin_dt = malloc(TEST_LEN);
+ if (NULL == origin_pt || NULL == origin_ct || NULL == origin_dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+ // For data lengths from 0 to 15 bytes, the functions return without any error
+ // codes, without reading or writing any data.
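+	// (XTS needs at least one full 16-byte block, since ciphertext stealing
+	// only handles a partial final block; the loop below therefore checks
+	// that the buffers are left untouched for these short sizes.)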
+ for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+ memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+ memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+ XTS_AES_128_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_128_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+ printf("efence_pt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+ printf("efence_ct: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+ printf("efence_dt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ for (i = 0; i < 16 * 11; i++) {
+ key2_exp_tw[i] = rand();
+ }
+
+ for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ aes_keyexp_128(key1, key1_exp_enc, key1_exp_dec);
+
+ XTS_AES_128_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+ TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_128_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+ TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+ printf("efence_expanded_key: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ // For data lengths from 0 to 15 bytes, the functions return without any error
+ // codes, without reading or writing any data.
+ for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+ memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+ memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+ aes_keyexp_128(key1, key1_exp_enc, key1_exp_dec);
+
+ XTS_AES_128_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+ TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_128_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+ TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+ printf("efence_expanded_key for pt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+ printf("efence_expanded_key for ct: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+ printf("efence_expanded_key for dt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ printf("Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c
new file mode 100644
index 000000000..aaed9347d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_rand_ossl_test.c
@@ -0,0 +1,207 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+#include <stdlib.h>
+#include <openssl/evp.h>
+
+#define TEST_LEN (1024*1024)
+#define TEST_LOOPS 100
+#ifndef RANDOMS
+# define RANDOMS 100
+#endif
+
+/* Generates random data for keys, tweak and plaintext */
+void mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *k3, unsigned char *p,
+ int n)
+{
+ int i;
+ for (i = 0; i < 16; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ *k3++ = rand();
+ }
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+/* Wrapper for OpenSSL EVP AES-XTS 128 encryption */
+static inline
+ int openssl_aes_128_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
+
+/* Wrapper for OpenSSL EVP AES-XTS 128 decryption */
+static inline
+ int openssl_aes_128_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *ct, unsigned char *dt)
+{
+ int outlen, tmplen;
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_128_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, len))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
+
+int main(void)
+{
+
+ unsigned char key1[16], key2[16], tinit[16];
+ unsigned char *pt, *ct, *dt, *refct, *refdt;
+ unsigned char keyssl[32]; /* SSL takes both keys together */
+ unsigned int rand_len, t;
+ int i, j, k;
+
+	/* Initialise our cipher context, which can use the same input vectors */
+ EVP_CIPHER_CTX ctx;
+ EVP_CIPHER_CTX_init(&ctx);
+
+ /* Allocate space for input and output buffers */
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+ refct = malloc(TEST_LEN);
+ refdt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt || NULL == refct || NULL == refdt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ /**************************** FIXED LENGTH TEST *************************/
+ printf("aes_xts_128_rand_ossl test, %d sets of length %d: ", TEST_LOOPS, TEST_LEN);
+
+ // Loop over the vectors
+ for (i = 0; i < TEST_LOOPS; i++) {
+
+ mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 16; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 16] = key2[k];
+ }
+
+ /* Encrypt using each method */
+ XTS_AES_128_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ openssl_aes_128_xts_enc(&ctx, keyssl, tinit, TEST_LEN, pt, refct);
+
+ /* Carry out comparison of the calculated ciphertext with
+ * the reference
+ */
+ for (j = 0; j < TEST_LEN; j++) {
+
+ if (ct[j] != refct[j]) {
+ printf("XTS_AES_128_enc failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+
+ /* Decrypt using each method */
+ XTS_AES_128_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ openssl_aes_128_xts_dec(&ctx, keyssl, tinit, TEST_LEN, refct, refdt);
+
+ for (j = 0; j < TEST_LEN; j++) {
+
+ if (dt[j] != refdt[j]) {
+ printf("XTS_AES_128_dec failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ fflush(0);
+ }
+ printf("Pass\n");
+
+ /**************************** RANDOM LENGTH TEST *************************/
+ printf("aes_xts_128_rand_ossl test, %d sets of random lengths: ", RANDOMS);
+
+ /* Run tests with random size */
+
+ for (t = 0; t < RANDOMS; t++) {
+
+ rand_len = rand() % (TEST_LEN);
+ mk_rand_data(key1, key2, tinit, pt, rand_len);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 16; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 16] = key2[k];
+ }
+
+ /* Encrypt using each method */
+ XTS_AES_128_enc(key2, key1, tinit, rand_len, pt, ct);
+ openssl_aes_128_xts_enc(&ctx, keyssl, tinit, rand_len, pt, refct);
+
+ /* Carry out comparison of the calculated ciphertext with
+ * the reference
+ */
+ for (j = 0; j < rand_len; j++) {
+
+ if (ct[j] != refct[j]) {
+ printf("XTS_AES_128_enc failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+
+ /* Decrypt using each method */
+ XTS_AES_128_dec(key2, key1, tinit, rand_len, ct, dt);
+ openssl_aes_128_xts_dec(&ctx, keyssl, tinit, rand_len, refct, refdt);
+
+ for (j = 0; j < rand_len; j++) {
+
+ if (dt[j] != refdt[j]) {
+ printf("XTS_AES_128_dec failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ fflush(0);
+ }
+ printf("Pass\n");
+
+ printf("aes_xts_128_rand_ossl: All tests passed\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c
new file mode 100644
index 000000000..4092d2dd2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_test.c
@@ -0,0 +1,106 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "xts_128_vect.h"
+
+int main(void)
+{
+
+ // Temporary buffers for the calculated ciphertext and plaintext
+ uint8_t *ct_test;
+ uint8_t *pt_test;
+
+ int i, j;
+
+ // --- Encryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vlist[i].ptlen);
+ if (ct_test == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return -1;
+ }
+
+ XTS_AES_128_enc(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (ct_test[j] != vlist[i].CTX[j]) {
+ // Vectors 1-9 and 15-19 of the standard are for the 128 bit code
+ printf("\nXTS_AES_128_enc: Vector %d: ",
+ i < 9 ? i + 1 : i + 6);
+
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ }
+
+ // --- Decryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated plaintext
+ pt_test = malloc(vlist[i].ptlen);
+ if (pt_test == NULL) {
+ fprintf(stderr, "Can't allocate plaintext memory\n");
+ return -1;
+ }
+
+ XTS_AES_128_dec(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+ // Carry out comparison of the calculated plaintext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (pt_test[j] != vlist[i].PTX[j]) {
+ printf("\nXTS_AES_128_dec: Vector %d: ",
+ i < 9 ? i + 1 : i + 6);
+
+ printf(" failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ }
+ printf("Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h
new file mode 100644
index 000000000..55a53bc27
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_128_vect.h
@@ -0,0 +1,1691 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+
+#define NVEC 14
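+
+// The 14 vectors are numbers 1-9 and 15-19 of the IEEE P1619 set, i.e. the AES-128 cases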
+
+// struct to hold pointers to the key, plaintext and ciphertext vectors
+struct xts_vector {
+ uint64_t ptlen; // length of our plaintext
+ uint8_t *key1; // dimension 16 for 128 bit aes
+ uint8_t *key2; // dimension 16 for 128 bit aes
+ uint8_t *TW; // dimension 16 for both 128 and 256 bit
+ uint8_t *PTX; // min. dimension 16
+ uint8_t *CTX; // same dimension as PTX
+};
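+
+// Each vector below is stored as a set of these static arrays, in the field
+// order above, and collected into the vlist[] table that the test programs
+// index up to NVEC (the table itself is defined further down in this header).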
+
+/* Define our test vectors statically here. Test vectors are from the standard:
+ * "IEEE Standard for Cryptographic Protection of Data on Block-Oriented
+ * Storage Devices"
+ * http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4493450
+ *
+ * Vector 1
+ * Key1 00000000000000000000000000000000
+ * Key2 00000000000000000000000000000000
+ * Data Unit Sequence number 0
+ * PTX 0000000000000000000000000000000000000000000000000000000000000000 /128bit
+ * TWK 66e94bd4ef8a2c3b884cfa59ca342b2eccd297a8df1559761099f4b39469565c
+ * CTX 917cf69ebd68b2ec9b9fe9a3eadda692cd43d2f59598ed858c02c2652fbf922e
+ * Plaintext length (bytes): 32
+ */
+
+static uint8_t v1_key1[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v1_key2[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v1_TW[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v1_PTX[32] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v1_CTX[32] = {
+ 0x91, 0x7c, 0xf6, 0x9e, 0xbd, 0x68, 0xb2, 0xec,
+ 0x9b, 0x9f, 0xe9, 0xa3, 0xea, 0xdd, 0xa6, 0x92,
+ 0xcd, 0x43, 0xd2, 0xf5, 0x95, 0x98, 0xed, 0x85,
+ 0x8c, 0x02, 0xc2, 0x65, 0x2f, 0xbf, 0x92, 0x2e
+};
+
+/*
+ * Vector 2
+ * Key1 11111111111111111111111111111111
+ * Key2 22222222222222222222222222222222
+ * Data Unit Sequence number 3333333333
+ * PTX 4444444444444444444444444444444444444444444444444444444444444444
+ * TWK 3f803bcd0d7fd2b37558419f59d5cda6f900779a1bfea467ebb0823eb3aa9b4d
+ * CTX c454185e6a16936e39334038acef838bfb186fff7480adc4289382ecd6d394f0
+ * Plaintext length (bytes): 32
+ */
+
+static uint8_t v2_key1[16] = {
+ 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+ 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11
+};
+
+static uint8_t v2_key2[16] = {
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22
+};
+
+static uint8_t v2_TW[16] = {
+ 0x33, 0x33, 0x33, 0x33, 0x33, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v2_PTX[32] = {
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44
+};
+
+static uint8_t v2_CTX[32] = {
+ 0xc4, 0x54, 0x18, 0x5e, 0x6a, 0x16, 0x93, 0x6e,
+ 0x39, 0x33, 0x40, 0x38, 0xac, 0xef, 0x83, 0x8b,
+ 0xfb, 0x18, 0x6f, 0xff, 0x74, 0x80, 0xad, 0xc4,
+ 0x28, 0x93, 0x82, 0xec, 0xd6, 0xd3, 0x94, 0xf0
+};
+
+/*
+ * Vector 3
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 22222222222222222222222222222222
+ * Data Unit Sequence number 3333333333
+ * PTX 4444444444444444444444444444444444444444444444444444444444444444
+ * TWK 3f803bcd0d7fd2b37558419f59d5cda6f900779a1bfea467ebb0823eb3aa9b4d
+ * CTX af85336b597afc1a900b2eb21ec949d292df4c047e0b21532186a5971a227a89
+ * Plaintext length (bytes): 32
+ */
+
+static uint8_t v3_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v3_key2[16] = {
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22
+};
+
+static uint8_t v3_TW[16] = {
+ 0x33, 0x33, 0x33, 0x33, 0x33, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v3_PTX[32] = {
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44,
+ 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44, 0x44
+};
+
+static uint8_t v3_CTX[32] = {
+ 0xaf, 0x85, 0x33, 0x6b, 0x59, 0x7a, 0xfc, 0x1a,
+ 0x90, 0x0b, 0x2e, 0xb2, 0x1e, 0xc9, 0x49, 0xd2,
+ 0x92, 0xdf, 0x4c, 0x04, 0x7e, 0x0b, 0x21, 0x53,
+ 0x21, 0x86, 0xa5, 0x97, 0x1a, 0x22, 0x7a, 0x89
+};
+
+/*
+ * Vector 4
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence number 0
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89c
+ * CTX c78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412
+ * CTX 328063fd2aab53e5ea1e0a9f332500a5df9487d07a5c92cc512c8866c7e860ce
+ * CTX 93fdf166a24912b422976146ae20ce846bb7dc9ba94a767aaef20c0d61ad0265
+ * CTX 5ea92dc4c4e41a8952c651d33174be51a10c421110e6d81588ede82103a252d8
+ * CTX a750e8768defffed9122810aaeb99f9172af82b604dc4b8e51bcb08235a6f434
+ * CTX 1332e4ca60482a4ba1a03b3e65008fc5da76b70bf1690db4eae29c5f1badd03c
+ * CTX 5ccf2a55d705ddcd86d449511ceb7ec30bf12b1fa35b913f9f747a8afd1b130e
+ * CTX 94bff94effd01a91735ca1726acd0b197c4e5b03393697e126826fb6bbde8ecc
+ * CTX 1e08298516e2c9ed03ff3c1b7860f6de76d4cecd94c8119855ef5297ca67e9f3
+ * CTX e7ff72b1e99785ca0a7e7720c5b36dc6d72cac9574c8cbbc2f801e23e56fd344
+ * CTX b07f22154beba0f08ce8891e643ed995c94d9a69c9f1b5f499027a78572aeebd
+ * CTX 74d20cc39881c213ee770b1010e4bea718846977ae119f7a023ab58cca0ad752
+ * CTX afe656bb3c17256a9f6e9bf19fdd5a38fc82bbe872c5539edb609ef4f79c203e
+ * CTX bb140f2e583cb2ad15b4aa5b655016a8449277dbd477ef2c8d6c017db738b18d
+ * CTX eb4a427d1923ce3ff262735779a418f20a282df920147beabe421ee5319d0568
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v4_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v4_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v4_TW[16] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v4_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v4_CTX[512] = {
+ 0x27, 0xa7, 0x47, 0x9b, 0xef, 0xa1, 0xd4, 0x76,
+ 0x48, 0x9f, 0x30, 0x8c, 0xd4, 0xcf, 0xa6, 0xe2,
+ 0xa9, 0x6e, 0x4b, 0xbe, 0x32, 0x08, 0xff, 0x25,
+ 0x28, 0x7d, 0xd3, 0x81, 0x96, 0x16, 0xe8, 0x9c,
+ 0xc7, 0x8c, 0xf7, 0xf5, 0xe5, 0x43, 0x44, 0x5f,
+ 0x83, 0x33, 0xd8, 0xfa, 0x7f, 0x56, 0x00, 0x00,
+ 0x05, 0x27, 0x9f, 0xa5, 0xd8, 0xb5, 0xe4, 0xad,
+ 0x40, 0xe7, 0x36, 0xdd, 0xb4, 0xd3, 0x54, 0x12,
+ 0x32, 0x80, 0x63, 0xfd, 0x2a, 0xab, 0x53, 0xe5,
+ 0xea, 0x1e, 0x0a, 0x9f, 0x33, 0x25, 0x00, 0xa5,
+ 0xdf, 0x94, 0x87, 0xd0, 0x7a, 0x5c, 0x92, 0xcc,
+ 0x51, 0x2c, 0x88, 0x66, 0xc7, 0xe8, 0x60, 0xce,
+ 0x93, 0xfd, 0xf1, 0x66, 0xa2, 0x49, 0x12, 0xb4,
+ 0x22, 0x97, 0x61, 0x46, 0xae, 0x20, 0xce, 0x84,
+ 0x6b, 0xb7, 0xdc, 0x9b, 0xa9, 0x4a, 0x76, 0x7a,
+ 0xae, 0xf2, 0x0c, 0x0d, 0x61, 0xad, 0x02, 0x65,
+ 0x5e, 0xa9, 0x2d, 0xc4, 0xc4, 0xe4, 0x1a, 0x89,
+ 0x52, 0xc6, 0x51, 0xd3, 0x31, 0x74, 0xbe, 0x51,
+ 0xa1, 0x0c, 0x42, 0x11, 0x10, 0xe6, 0xd8, 0x15,
+ 0x88, 0xed, 0xe8, 0x21, 0x03, 0xa2, 0x52, 0xd8,
+ 0xa7, 0x50, 0xe8, 0x76, 0x8d, 0xef, 0xff, 0xed,
+ 0x91, 0x22, 0x81, 0x0a, 0xae, 0xb9, 0x9f, 0x91,
+ 0x72, 0xaf, 0x82, 0xb6, 0x04, 0xdc, 0x4b, 0x8e,
+ 0x51, 0xbc, 0xb0, 0x82, 0x35, 0xa6, 0xf4, 0x34,
+ 0x13, 0x32, 0xe4, 0xca, 0x60, 0x48, 0x2a, 0x4b,
+ 0xa1, 0xa0, 0x3b, 0x3e, 0x65, 0x00, 0x8f, 0xc5,
+ 0xda, 0x76, 0xb7, 0x0b, 0xf1, 0x69, 0x0d, 0xb4,
+ 0xea, 0xe2, 0x9c, 0x5f, 0x1b, 0xad, 0xd0, 0x3c,
+ 0x5c, 0xcf, 0x2a, 0x55, 0xd7, 0x05, 0xdd, 0xcd,
+ 0x86, 0xd4, 0x49, 0x51, 0x1c, 0xeb, 0x7e, 0xc3,
+ 0x0b, 0xf1, 0x2b, 0x1f, 0xa3, 0x5b, 0x91, 0x3f,
+ 0x9f, 0x74, 0x7a, 0x8a, 0xfd, 0x1b, 0x13, 0x0e,
+ 0x94, 0xbf, 0xf9, 0x4e, 0xff, 0xd0, 0x1a, 0x91,
+ 0x73, 0x5c, 0xa1, 0x72, 0x6a, 0xcd, 0x0b, 0x19,
+ 0x7c, 0x4e, 0x5b, 0x03, 0x39, 0x36, 0x97, 0xe1,
+ 0x26, 0x82, 0x6f, 0xb6, 0xbb, 0xde, 0x8e, 0xcc,
+ 0x1e, 0x08, 0x29, 0x85, 0x16, 0xe2, 0xc9, 0xed,
+ 0x03, 0xff, 0x3c, 0x1b, 0x78, 0x60, 0xf6, 0xde,
+ 0x76, 0xd4, 0xce, 0xcd, 0x94, 0xc8, 0x11, 0x98,
+ 0x55, 0xef, 0x52, 0x97, 0xca, 0x67, 0xe9, 0xf3,
+ 0xe7, 0xff, 0x72, 0xb1, 0xe9, 0x97, 0x85, 0xca,
+ 0x0a, 0x7e, 0x77, 0x20, 0xc5, 0xb3, 0x6d, 0xc6,
+ 0xd7, 0x2c, 0xac, 0x95, 0x74, 0xc8, 0xcb, 0xbc,
+ 0x2f, 0x80, 0x1e, 0x23, 0xe5, 0x6f, 0xd3, 0x44,
+ 0xb0, 0x7f, 0x22, 0x15, 0x4b, 0xeb, 0xa0, 0xf0,
+ 0x8c, 0xe8, 0x89, 0x1e, 0x64, 0x3e, 0xd9, 0x95,
+ 0xc9, 0x4d, 0x9a, 0x69, 0xc9, 0xf1, 0xb5, 0xf4,
+ 0x99, 0x02, 0x7a, 0x78, 0x57, 0x2a, 0xee, 0xbd,
+ 0x74, 0xd2, 0x0c, 0xc3, 0x98, 0x81, 0xc2, 0x13,
+ 0xee, 0x77, 0x0b, 0x10, 0x10, 0xe4, 0xbe, 0xa7,
+ 0x18, 0x84, 0x69, 0x77, 0xae, 0x11, 0x9f, 0x7a,
+ 0x02, 0x3a, 0xb5, 0x8c, 0xca, 0x0a, 0xd7, 0x52,
+ 0xaf, 0xe6, 0x56, 0xbb, 0x3c, 0x17, 0x25, 0x6a,
+ 0x9f, 0x6e, 0x9b, 0xf1, 0x9f, 0xdd, 0x5a, 0x38,
+ 0xfc, 0x82, 0xbb, 0xe8, 0x72, 0xc5, 0x53, 0x9e,
+ 0xdb, 0x60, 0x9e, 0xf4, 0xf7, 0x9c, 0x20, 0x3e,
+ 0xbb, 0x14, 0x0f, 0x2e, 0x58, 0x3c, 0xb2, 0xad,
+ 0x15, 0xb4, 0xaa, 0x5b, 0x65, 0x50, 0x16, 0xa8,
+ 0x44, 0x92, 0x77, 0xdb, 0xd4, 0x77, 0xef, 0x2c,
+ 0x8d, 0x6c, 0x01, 0x7d, 0xb7, 0x38, 0xb1, 0x8d,
+ 0xeb, 0x4a, 0x42, 0x7d, 0x19, 0x23, 0xce, 0x3f,
+ 0xf2, 0x62, 0x73, 0x57, 0x79, 0xa4, 0x18, 0xf2,
+ 0x0a, 0x28, 0x2d, 0xf9, 0x20, 0x14, 0x7b, 0xea,
+ 0xbe, 0x42, 0x1e, 0xe5, 0x31, 0x9d, 0x05, 0x68
+};
+
+/*
+ * Vector 5
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number 01
+ * PTX 27a7479befa1d476489f308cd4cfa6e2a96e4bbe3208ff25287dd3819616e89c
+ * PTX c78cf7f5e543445f8333d8fa7f56000005279fa5d8b5e4ad40e736ddb4d35412
+ * PTX 328063fd2aab53e5ea1e0a9f332500a5df9487d07a5c92cc512c8866c7e860ce
+ * PTX 93fdf166a24912b422976146ae20ce846bb7dc9ba94a767aaef20c0d61ad0265
+ * PTX 5ea92dc4c4e41a8952c651d33174be51a10c421110e6d81588ede82103a252d8
+ * PTX a750e8768defffed9122810aaeb99f9172af82b604dc4b8e51bcb08235a6f434
+ * PTX 1332e4ca60482a4ba1a03b3e65008fc5da76b70bf1690db4eae29c5f1badd03c
+ * PTX 5ccf2a55d705ddcd86d449511ceb7ec30bf12b1fa35b913f9f747a8afd1b130e
+ * PTX 94bff94effd01a91735ca1726acd0b197c4e5b03393697e126826fb6bbde8ecc
+ * PTX 1e08298516e2c9ed03ff3c1b7860f6de76d4cecd94c8119855ef5297ca67e9f3
+ * PTX e7ff72b1e99785ca0a7e7720c5b36dc6d72cac9574c8cbbc2f801e23e56fd344
+ * PTX b07f22154beba0f08ce8891e643ed995c94d9a69c9f1b5f499027a78572aeebd
+ * PTX 74d20cc39881c213ee770b1010e4bea718846977ae119f7a023ab58cca0ad752
+ * PTX afe656bb3c17256a9f6e9bf19fdd5a38fc82bbe872c5539edb609ef4f79c203e
+ * PTX bb140f2e583cb2ad15b4aa5b655016a8449277dbd477ef2c8d6c017db738b18d
+ * PTX eb4a427d1923ce3ff262735779a418f20a282df920147beabe421ee5319d0568
+ * CTX 264d3ca8512194fec312c8c9891f279fefdd608d0c027b60483a3fa811d65ee5
+ * CTX 9d52d9e40ec5672d81532b38b6b089ce951f0f9c35590b8b978d175213f329bb
+ * CTX 1c2fd30f2f7f30492a61a532a79f51d36f5e31a7c9a12c286082ff7d2394d18f
+ * CTX 783e1a8e72c722caaaa52d8f065657d2631fd25bfd8e5baad6e527d763517501
+ * CTX c68c5edc3cdd55435c532d7125c8614deed9adaa3acade5888b87bef641c4c99
+ * CTX 4c8091b5bcd387f3963fb5bc37aa922fbfe3df4e5b915e6eb514717bdd2a7407
+ * CTX 9a5073f5c4bfd46adf7d282e7a393a52579d11a028da4d9cd9c77124f9648ee3
+ * CTX 83b1ac763930e7162a8d37f350b2f74b8472cf09902063c6b32e8c2d9290cefb
+ * CTX d7346d1c779a0df50edcde4531da07b099c638e83a755944df2aef1aa31752fd
+ * CTX 323dcb710fb4bfbb9d22b925bc3577e1b8949e729a90bbafeacf7f7879e7b114
+ * CTX 7e28ba0bae940db795a61b15ecf4df8db07b824bb062802cc98a9545bb2aaeed
+ * CTX 77cb3fc6db15dcd7d80d7d5bc406c4970a3478ada8899b329198eb61c193fb62
+ * CTX 75aa8ca340344a75a862aebe92eee1ce032fd950b47d7704a3876923b4ad6284
+ * CTX 4bf4a09c4dbe8b4397184b7471360c9564880aedddb9baa4af2e75394b08cd32
+ * CTX ff479c57a07d3eab5d54de5f9738b8d27f27a9f0ab11799d7b7ffefb2704c95c
+ * CTX 6ad12c39f1e867a4b7b1d7818a4b753dfd2a89ccb45e001a03a867b187f225dd
+ * Plaintext length (bytes): 512
+ */
+
+static uint8_t v5_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v5_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v5_TW[16] = {
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v5_PTX[512] = {
+ 0x27, 0xa7, 0x47, 0x9b, 0xef, 0xa1, 0xd4, 0x76,
+ 0x48, 0x9f, 0x30, 0x8c, 0xd4, 0xcf, 0xa6, 0xe2,
+ 0xa9, 0x6e, 0x4b, 0xbe, 0x32, 0x08, 0xff, 0x25,
+ 0x28, 0x7d, 0xd3, 0x81, 0x96, 0x16, 0xe8, 0x9c,
+ 0xc7, 0x8c, 0xf7, 0xf5, 0xe5, 0x43, 0x44, 0x5f,
+ 0x83, 0x33, 0xd8, 0xfa, 0x7f, 0x56, 0x00, 0x00,
+ 0x05, 0x27, 0x9f, 0xa5, 0xd8, 0xb5, 0xe4, 0xad,
+ 0x40, 0xe7, 0x36, 0xdd, 0xb4, 0xd3, 0x54, 0x12,
+ 0x32, 0x80, 0x63, 0xfd, 0x2a, 0xab, 0x53, 0xe5,
+ 0xea, 0x1e, 0x0a, 0x9f, 0x33, 0x25, 0x00, 0xa5,
+ 0xdf, 0x94, 0x87, 0xd0, 0x7a, 0x5c, 0x92, 0xcc,
+ 0x51, 0x2c, 0x88, 0x66, 0xc7, 0xe8, 0x60, 0xce,
+ 0x93, 0xfd, 0xf1, 0x66, 0xa2, 0x49, 0x12, 0xb4,
+ 0x22, 0x97, 0x61, 0x46, 0xae, 0x20, 0xce, 0x84,
+ 0x6b, 0xb7, 0xdc, 0x9b, 0xa9, 0x4a, 0x76, 0x7a,
+ 0xae, 0xf2, 0x0c, 0x0d, 0x61, 0xad, 0x02, 0x65,
+ 0x5e, 0xa9, 0x2d, 0xc4, 0xc4, 0xe4, 0x1a, 0x89,
+ 0x52, 0xc6, 0x51, 0xd3, 0x31, 0x74, 0xbe, 0x51,
+ 0xa1, 0x0c, 0x42, 0x11, 0x10, 0xe6, 0xd8, 0x15,
+ 0x88, 0xed, 0xe8, 0x21, 0x03, 0xa2, 0x52, 0xd8,
+ 0xa7, 0x50, 0xe8, 0x76, 0x8d, 0xef, 0xff, 0xed,
+ 0x91, 0x22, 0x81, 0x0a, 0xae, 0xb9, 0x9f, 0x91,
+ 0x72, 0xaf, 0x82, 0xb6, 0x04, 0xdc, 0x4b, 0x8e,
+ 0x51, 0xbc, 0xb0, 0x82, 0x35, 0xa6, 0xf4, 0x34,
+ 0x13, 0x32, 0xe4, 0xca, 0x60, 0x48, 0x2a, 0x4b,
+ 0xa1, 0xa0, 0x3b, 0x3e, 0x65, 0x00, 0x8f, 0xc5,
+ 0xda, 0x76, 0xb7, 0x0b, 0xf1, 0x69, 0x0d, 0xb4,
+ 0xea, 0xe2, 0x9c, 0x5f, 0x1b, 0xad, 0xd0, 0x3c,
+ 0x5c, 0xcf, 0x2a, 0x55, 0xd7, 0x05, 0xdd, 0xcd,
+ 0x86, 0xd4, 0x49, 0x51, 0x1c, 0xeb, 0x7e, 0xc3,
+ 0x0b, 0xf1, 0x2b, 0x1f, 0xa3, 0x5b, 0x91, 0x3f,
+ 0x9f, 0x74, 0x7a, 0x8a, 0xfd, 0x1b, 0x13, 0x0e,
+ 0x94, 0xbf, 0xf9, 0x4e, 0xff, 0xd0, 0x1a, 0x91,
+ 0x73, 0x5c, 0xa1, 0x72, 0x6a, 0xcd, 0x0b, 0x19,
+ 0x7c, 0x4e, 0x5b, 0x03, 0x39, 0x36, 0x97, 0xe1,
+ 0x26, 0x82, 0x6f, 0xb6, 0xbb, 0xde, 0x8e, 0xcc,
+ 0x1e, 0x08, 0x29, 0x85, 0x16, 0xe2, 0xc9, 0xed,
+ 0x03, 0xff, 0x3c, 0x1b, 0x78, 0x60, 0xf6, 0xde,
+ 0x76, 0xd4, 0xce, 0xcd, 0x94, 0xc8, 0x11, 0x98,
+ 0x55, 0xef, 0x52, 0x97, 0xca, 0x67, 0xe9, 0xf3,
+ 0xe7, 0xff, 0x72, 0xb1, 0xe9, 0x97, 0x85, 0xca,
+ 0x0a, 0x7e, 0x77, 0x20, 0xc5, 0xb3, 0x6d, 0xc6,
+ 0xd7, 0x2c, 0xac, 0x95, 0x74, 0xc8, 0xcb, 0xbc,
+ 0x2f, 0x80, 0x1e, 0x23, 0xe5, 0x6f, 0xd3, 0x44,
+ 0xb0, 0x7f, 0x22, 0x15, 0x4b, 0xeb, 0xa0, 0xf0,
+ 0x8c, 0xe8, 0x89, 0x1e, 0x64, 0x3e, 0xd9, 0x95,
+ 0xc9, 0x4d, 0x9a, 0x69, 0xc9, 0xf1, 0xb5, 0xf4,
+ 0x99, 0x02, 0x7a, 0x78, 0x57, 0x2a, 0xee, 0xbd,
+ 0x74, 0xd2, 0x0c, 0xc3, 0x98, 0x81, 0xc2, 0x13,
+ 0xee, 0x77, 0x0b, 0x10, 0x10, 0xe4, 0xbe, 0xa7,
+ 0x18, 0x84, 0x69, 0x77, 0xae, 0x11, 0x9f, 0x7a,
+ 0x02, 0x3a, 0xb5, 0x8c, 0xca, 0x0a, 0xd7, 0x52,
+ 0xaf, 0xe6, 0x56, 0xbb, 0x3c, 0x17, 0x25, 0x6a,
+ 0x9f, 0x6e, 0x9b, 0xf1, 0x9f, 0xdd, 0x5a, 0x38,
+ 0xfc, 0x82, 0xbb, 0xe8, 0x72, 0xc5, 0x53, 0x9e,
+ 0xdb, 0x60, 0x9e, 0xf4, 0xf7, 0x9c, 0x20, 0x3e,
+ 0xbb, 0x14, 0x0f, 0x2e, 0x58, 0x3c, 0xb2, 0xad,
+ 0x15, 0xb4, 0xaa, 0x5b, 0x65, 0x50, 0x16, 0xa8,
+ 0x44, 0x92, 0x77, 0xdb, 0xd4, 0x77, 0xef, 0x2c,
+ 0x8d, 0x6c, 0x01, 0x7d, 0xb7, 0x38, 0xb1, 0x8d,
+ 0xeb, 0x4a, 0x42, 0x7d, 0x19, 0x23, 0xce, 0x3f,
+ 0xf2, 0x62, 0x73, 0x57, 0x79, 0xa4, 0x18, 0xf2,
+ 0x0a, 0x28, 0x2d, 0xf9, 0x20, 0x14, 0x7b, 0xea,
+ 0xbe, 0x42, 0x1e, 0xe5, 0x31, 0x9d, 0x05, 0x68
+};
+
+static uint8_t v5_CTX[512] = {
+ 0x26, 0x4d, 0x3c, 0xa8, 0x51, 0x21, 0x94, 0xfe,
+ 0xc3, 0x12, 0xc8, 0xc9, 0x89, 0x1f, 0x27, 0x9f,
+ 0xef, 0xdd, 0x60, 0x8d, 0x0c, 0x02, 0x7b, 0x60,
+ 0x48, 0x3a, 0x3f, 0xa8, 0x11, 0xd6, 0x5e, 0xe5,
+ 0x9d, 0x52, 0xd9, 0xe4, 0x0e, 0xc5, 0x67, 0x2d,
+ 0x81, 0x53, 0x2b, 0x38, 0xb6, 0xb0, 0x89, 0xce,
+ 0x95, 0x1f, 0x0f, 0x9c, 0x35, 0x59, 0x0b, 0x8b,
+ 0x97, 0x8d, 0x17, 0x52, 0x13, 0xf3, 0x29, 0xbb,
+ 0x1c, 0x2f, 0xd3, 0x0f, 0x2f, 0x7f, 0x30, 0x49,
+ 0x2a, 0x61, 0xa5, 0x32, 0xa7, 0x9f, 0x51, 0xd3,
+ 0x6f, 0x5e, 0x31, 0xa7, 0xc9, 0xa1, 0x2c, 0x28,
+ 0x60, 0x82, 0xff, 0x7d, 0x23, 0x94, 0xd1, 0x8f,
+ 0x78, 0x3e, 0x1a, 0x8e, 0x72, 0xc7, 0x22, 0xca,
+ 0xaa, 0xa5, 0x2d, 0x8f, 0x06, 0x56, 0x57, 0xd2,
+ 0x63, 0x1f, 0xd2, 0x5b, 0xfd, 0x8e, 0x5b, 0xaa,
+ 0xd6, 0xe5, 0x27, 0xd7, 0x63, 0x51, 0x75, 0x01,
+ 0xc6, 0x8c, 0x5e, 0xdc, 0x3c, 0xdd, 0x55, 0x43,
+ 0x5c, 0x53, 0x2d, 0x71, 0x25, 0xc8, 0x61, 0x4d,
+ 0xee, 0xd9, 0xad, 0xaa, 0x3a, 0xca, 0xde, 0x58,
+ 0x88, 0xb8, 0x7b, 0xef, 0x64, 0x1c, 0x4c, 0x99,
+ 0x4c, 0x80, 0x91, 0xb5, 0xbc, 0xd3, 0x87, 0xf3,
+ 0x96, 0x3f, 0xb5, 0xbc, 0x37, 0xaa, 0x92, 0x2f,
+ 0xbf, 0xe3, 0xdf, 0x4e, 0x5b, 0x91, 0x5e, 0x6e,
+ 0xb5, 0x14, 0x71, 0x7b, 0xdd, 0x2a, 0x74, 0x07,
+ 0x9a, 0x50, 0x73, 0xf5, 0xc4, 0xbf, 0xd4, 0x6a,
+ 0xdf, 0x7d, 0x28, 0x2e, 0x7a, 0x39, 0x3a, 0x52,
+ 0x57, 0x9d, 0x11, 0xa0, 0x28, 0xda, 0x4d, 0x9c,
+ 0xd9, 0xc7, 0x71, 0x24, 0xf9, 0x64, 0x8e, 0xe3,
+ 0x83, 0xb1, 0xac, 0x76, 0x39, 0x30, 0xe7, 0x16,
+ 0x2a, 0x8d, 0x37, 0xf3, 0x50, 0xb2, 0xf7, 0x4b,
+ 0x84, 0x72, 0xcf, 0x09, 0x90, 0x20, 0x63, 0xc6,
+ 0xb3, 0x2e, 0x8c, 0x2d, 0x92, 0x90, 0xce, 0xfb,
+ 0xd7, 0x34, 0x6d, 0x1c, 0x77, 0x9a, 0x0d, 0xf5,
+ 0x0e, 0xdc, 0xde, 0x45, 0x31, 0xda, 0x07, 0xb0,
+ 0x99, 0xc6, 0x38, 0xe8, 0x3a, 0x75, 0x59, 0x44,
+ 0xdf, 0x2a, 0xef, 0x1a, 0xa3, 0x17, 0x52, 0xfd,
+ 0x32, 0x3d, 0xcb, 0x71, 0x0f, 0xb4, 0xbf, 0xbb,
+ 0x9d, 0x22, 0xb9, 0x25, 0xbc, 0x35, 0x77, 0xe1,
+ 0xb8, 0x94, 0x9e, 0x72, 0x9a, 0x90, 0xbb, 0xaf,
+ 0xea, 0xcf, 0x7f, 0x78, 0x79, 0xe7, 0xb1, 0x14,
+ 0x7e, 0x28, 0xba, 0x0b, 0xae, 0x94, 0x0d, 0xb7,
+ 0x95, 0xa6, 0x1b, 0x15, 0xec, 0xf4, 0xdf, 0x8d,
+ 0xb0, 0x7b, 0x82, 0x4b, 0xb0, 0x62, 0x80, 0x2c,
+ 0xc9, 0x8a, 0x95, 0x45, 0xbb, 0x2a, 0xae, 0xed,
+ 0x77, 0xcb, 0x3f, 0xc6, 0xdb, 0x15, 0xdc, 0xd7,
+ 0xd8, 0x0d, 0x7d, 0x5b, 0xc4, 0x06, 0xc4, 0x97,
+ 0x0a, 0x34, 0x78, 0xad, 0xa8, 0x89, 0x9b, 0x32,
+ 0x91, 0x98, 0xeb, 0x61, 0xc1, 0x93, 0xfb, 0x62,
+ 0x75, 0xaa, 0x8c, 0xa3, 0x40, 0x34, 0x4a, 0x75,
+ 0xa8, 0x62, 0xae, 0xbe, 0x92, 0xee, 0xe1, 0xce,
+ 0x03, 0x2f, 0xd9, 0x50, 0xb4, 0x7d, 0x77, 0x04,
+ 0xa3, 0x87, 0x69, 0x23, 0xb4, 0xad, 0x62, 0x84,
+ 0x4b, 0xf4, 0xa0, 0x9c, 0x4d, 0xbe, 0x8b, 0x43,
+ 0x97, 0x18, 0x4b, 0x74, 0x71, 0x36, 0x0c, 0x95,
+ 0x64, 0x88, 0x0a, 0xed, 0xdd, 0xb9, 0xba, 0xa4,
+ 0xaf, 0x2e, 0x75, 0x39, 0x4b, 0x08, 0xcd, 0x32,
+ 0xff, 0x47, 0x9c, 0x57, 0xa0, 0x7d, 0x3e, 0xab,
+ 0x5d, 0x54, 0xde, 0x5f, 0x97, 0x38, 0xb8, 0xd2,
+ 0x7f, 0x27, 0xa9, 0xf0, 0xab, 0x11, 0x79, 0x9d,
+ 0x7b, 0x7f, 0xfe, 0xfb, 0x27, 0x04, 0xc9, 0x5c,
+ 0x6a, 0xd1, 0x2c, 0x39, 0xf1, 0xe8, 0x67, 0xa4,
+ 0xb7, 0xb1, 0xd7, 0x81, 0x8a, 0x4b, 0x75, 0x3d,
+ 0xfd, 0x2a, 0x89, 0xcc, 0xb4, 0x5e, 0x00, 0x1a,
+ 0x03, 0xa8, 0x67, 0xb1, 0x87, 0xf2, 0x25, 0xdd
+};
+
+/*
+ * Vector 6
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number 02
+ * PTX 264d3ca8512194fec312c8c9891f279fefdd608d0c027b60483a3fa811d65ee5
+ * PTX 9d52d9e40ec5672d81532b38b6b089ce951f0f9c35590b8b978d175213f329bb
+ * PTX 1c2fd30f2f7f30492a61a532a79f51d36f5e31a7c9a12c286082ff7d2394d18f
+ * PTX 783e1a8e72c722caaaa52d8f065657d2631fd25bfd8e5baad6e527d763517501
+ * PTX c68c5edc3cdd55435c532d7125c8614deed9adaa3acade5888b87bef641c4c99
+ * PTX 4c8091b5bcd387f3963fb5bc37aa922fbfe3df4e5b915e6eb514717bdd2a7407
+ * PTX 9a5073f5c4bfd46adf7d282e7a393a52579d11a028da4d9cd9c77124f9648ee3
+ * PTX 83b1ac763930e7162a8d37f350b2f74b8472cf09902063c6b32e8c2d9290cefb
+ * PTX d7346d1c779a0df50edcde4531da07b099c638e83a755944df2aef1aa31752fd
+ * PTX 323dcb710fb4bfbb9d22b925bc3577e1b8949e729a90bbafeacf7f7879e7b114
+ * PTX 7e28ba0bae940db795a61b15ecf4df8db07b824bb062802cc98a9545bb2aaeed
+ * PTX 77cb3fc6db15dcd7d80d7d5bc406c4970a3478ada8899b329198eb61c193fb62
+ * PTX 75aa8ca340344a75a862aebe92eee1ce032fd950b47d7704a3876923b4ad6284
+ * PTX 4bf4a09c4dbe8b4397184b7471360c9564880aedddb9baa4af2e75394b08cd32
+ * PTX ff479c57a07d3eab5d54de5f9738b8d27f27a9f0ab11799d7b7ffefb2704c95c
+ * PTX 6ad12c39f1e867a4b7b1d7818a4b753dfd2a89ccb45e001a03a867b187f225dd
+ * CTX fa762a3680b76007928ed4a4f49a9456031b704782e65e16cecb54ed7d017b5e
+ * CTX 18abd67b338e81078f21edb7868d901ebe9c731a7c18b5e6dec1d6a72e078ac9
+ * CTX a4262f860beefa14f4e821018272e411a951502b6e79066e84252c3346f3aa62
+ * CTX 344351a291d4bedc7a07618bdea2af63145cc7a4b8d4070691ae890cd65733e7
+ * CTX 946e9021a1dffc4c59f159425ee6d50ca9b135fa6162cea18a939838dc000fb3
+ * CTX 86fad086acce5ac07cb2ece7fd580b00cfa5e98589631dc25e8e2a3daf2ffdec
+ * CTX 26531659912c9d8f7a15e5865ea8fb5816d6207052bd7128cd743c12c8118791
+ * CTX a4736811935eb982a532349e31dd401e0b660a568cb1a4711f552f55ded59f1f
+ * CTX 15bf7196b3ca12a91e488ef59d64f3a02bf45239499ac6176ae321c4a211ec54
+ * CTX 5365971c5d3f4f09d4eb139bfdf2073d33180b21002b65cc9865e76cb24cd92c
+ * CTX 874c24c18350399a936ab3637079295d76c417776b94efce3a0ef7206b151105
+ * CTX 19655c956cbd8b2489405ee2b09a6b6eebe0c53790a12a8998378b33a5b71159
+ * CTX 625f4ba49d2a2fdba59fbf0897bc7aabd8d707dc140a80f0f309f835d3da54ab
+ * CTX 584e501dfa0ee977fec543f74186a802b9a37adb3e8291eca04d66520d229e60
+ * CTX 401e7282bef486ae059aa70696e0e305d777140a7a883ecdcb69b9ff938e8a42
+ * CTX 31864c69ca2c2043bed007ff3e605e014bcf518138dc3a25c5e236171a2d01d6
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v6_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v6_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v6_TW[16] = {
+ 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v6_PTX[512] = {
+
+ 0x26, 0x4d, 0x3c, 0xa8, 0x51, 0x21, 0x94, 0xfe,
+ 0xc3, 0x12, 0xc8, 0xc9, 0x89, 0x1f, 0x27, 0x9f,
+ 0xef, 0xdd, 0x60, 0x8d, 0x0c, 0x02, 0x7b, 0x60,
+ 0x48, 0x3a, 0x3f, 0xa8, 0x11, 0xd6, 0x5e, 0xe5,
+ 0x9d, 0x52, 0xd9, 0xe4, 0x0e, 0xc5, 0x67, 0x2d,
+ 0x81, 0x53, 0x2b, 0x38, 0xb6, 0xb0, 0x89, 0xce,
+ 0x95, 0x1f, 0x0f, 0x9c, 0x35, 0x59, 0x0b, 0x8b,
+ 0x97, 0x8d, 0x17, 0x52, 0x13, 0xf3, 0x29, 0xbb,
+ 0x1c, 0x2f, 0xd3, 0x0f, 0x2f, 0x7f, 0x30, 0x49,
+ 0x2a, 0x61, 0xa5, 0x32, 0xa7, 0x9f, 0x51, 0xd3,
+ 0x6f, 0x5e, 0x31, 0xa7, 0xc9, 0xa1, 0x2c, 0x28,
+ 0x60, 0x82, 0xff, 0x7d, 0x23, 0x94, 0xd1, 0x8f,
+ 0x78, 0x3e, 0x1a, 0x8e, 0x72, 0xc7, 0x22, 0xca,
+ 0xaa, 0xa5, 0x2d, 0x8f, 0x06, 0x56, 0x57, 0xd2,
+ 0x63, 0x1f, 0xd2, 0x5b, 0xfd, 0x8e, 0x5b, 0xaa,
+ 0xd6, 0xe5, 0x27, 0xd7, 0x63, 0x51, 0x75, 0x01,
+ 0xc6, 0x8c, 0x5e, 0xdc, 0x3c, 0xdd, 0x55, 0x43,
+ 0x5c, 0x53, 0x2d, 0x71, 0x25, 0xc8, 0x61, 0x4d,
+ 0xee, 0xd9, 0xad, 0xaa, 0x3a, 0xca, 0xde, 0x58,
+ 0x88, 0xb8, 0x7b, 0xef, 0x64, 0x1c, 0x4c, 0x99,
+ 0x4c, 0x80, 0x91, 0xb5, 0xbc, 0xd3, 0x87, 0xf3,
+ 0x96, 0x3f, 0xb5, 0xbc, 0x37, 0xaa, 0x92, 0x2f,
+ 0xbf, 0xe3, 0xdf, 0x4e, 0x5b, 0x91, 0x5e, 0x6e,
+ 0xb5, 0x14, 0x71, 0x7b, 0xdd, 0x2a, 0x74, 0x07,
+ 0x9a, 0x50, 0x73, 0xf5, 0xc4, 0xbf, 0xd4, 0x6a,
+ 0xdf, 0x7d, 0x28, 0x2e, 0x7a, 0x39, 0x3a, 0x52,
+ 0x57, 0x9d, 0x11, 0xa0, 0x28, 0xda, 0x4d, 0x9c,
+ 0xd9, 0xc7, 0x71, 0x24, 0xf9, 0x64, 0x8e, 0xe3,
+ 0x83, 0xb1, 0xac, 0x76, 0x39, 0x30, 0xe7, 0x16,
+ 0x2a, 0x8d, 0x37, 0xf3, 0x50, 0xb2, 0xf7, 0x4b,
+ 0x84, 0x72, 0xcf, 0x09, 0x90, 0x20, 0x63, 0xc6,
+ 0xb3, 0x2e, 0x8c, 0x2d, 0x92, 0x90, 0xce, 0xfb,
+ 0xd7, 0x34, 0x6d, 0x1c, 0x77, 0x9a, 0x0d, 0xf5,
+ 0x0e, 0xdc, 0xde, 0x45, 0x31, 0xda, 0x07, 0xb0,
+ 0x99, 0xc6, 0x38, 0xe8, 0x3a, 0x75, 0x59, 0x44,
+ 0xdf, 0x2a, 0xef, 0x1a, 0xa3, 0x17, 0x52, 0xfd,
+ 0x32, 0x3d, 0xcb, 0x71, 0x0f, 0xb4, 0xbf, 0xbb,
+ 0x9d, 0x22, 0xb9, 0x25, 0xbc, 0x35, 0x77, 0xe1,
+ 0xb8, 0x94, 0x9e, 0x72, 0x9a, 0x90, 0xbb, 0xaf,
+ 0xea, 0xcf, 0x7f, 0x78, 0x79, 0xe7, 0xb1, 0x14,
+ 0x7e, 0x28, 0xba, 0x0b, 0xae, 0x94, 0x0d, 0xb7,
+ 0x95, 0xa6, 0x1b, 0x15, 0xec, 0xf4, 0xdf, 0x8d,
+ 0xb0, 0x7b, 0x82, 0x4b, 0xb0, 0x62, 0x80, 0x2c,
+ 0xc9, 0x8a, 0x95, 0x45, 0xbb, 0x2a, 0xae, 0xed,
+ 0x77, 0xcb, 0x3f, 0xc6, 0xdb, 0x15, 0xdc, 0xd7,
+ 0xd8, 0x0d, 0x7d, 0x5b, 0xc4, 0x06, 0xc4, 0x97,
+ 0x0a, 0x34, 0x78, 0xad, 0xa8, 0x89, 0x9b, 0x32,
+ 0x91, 0x98, 0xeb, 0x61, 0xc1, 0x93, 0xfb, 0x62,
+ 0x75, 0xaa, 0x8c, 0xa3, 0x40, 0x34, 0x4a, 0x75,
+ 0xa8, 0x62, 0xae, 0xbe, 0x92, 0xee, 0xe1, 0xce,
+ 0x03, 0x2f, 0xd9, 0x50, 0xb4, 0x7d, 0x77, 0x04,
+ 0xa3, 0x87, 0x69, 0x23, 0xb4, 0xad, 0x62, 0x84,
+ 0x4b, 0xf4, 0xa0, 0x9c, 0x4d, 0xbe, 0x8b, 0x43,
+ 0x97, 0x18, 0x4b, 0x74, 0x71, 0x36, 0x0c, 0x95,
+ 0x64, 0x88, 0x0a, 0xed, 0xdd, 0xb9, 0xba, 0xa4,
+ 0xaf, 0x2e, 0x75, 0x39, 0x4b, 0x08, 0xcd, 0x32,
+ 0xff, 0x47, 0x9c, 0x57, 0xa0, 0x7d, 0x3e, 0xab,
+ 0x5d, 0x54, 0xde, 0x5f, 0x97, 0x38, 0xb8, 0xd2,
+ 0x7f, 0x27, 0xa9, 0xf0, 0xab, 0x11, 0x79, 0x9d,
+ 0x7b, 0x7f, 0xfe, 0xfb, 0x27, 0x04, 0xc9, 0x5c,
+ 0x6a, 0xd1, 0x2c, 0x39, 0xf1, 0xe8, 0x67, 0xa4,
+ 0xb7, 0xb1, 0xd7, 0x81, 0x8a, 0x4b, 0x75, 0x3d,
+ 0xfd, 0x2a, 0x89, 0xcc, 0xb4, 0x5e, 0x00, 0x1a,
+ 0x03, 0xa8, 0x67, 0xb1, 0x87, 0xf2, 0x25, 0xdd
+};
+
+static uint8_t v6_CTX[512] = {
+
+ 0xfa, 0x76, 0x2a, 0x36, 0x80, 0xb7, 0x60, 0x07,
+ 0x92, 0x8e, 0xd4, 0xa4, 0xf4, 0x9a, 0x94, 0x56,
+ 0x03, 0x1b, 0x70, 0x47, 0x82, 0xe6, 0x5e, 0x16,
+ 0xce, 0xcb, 0x54, 0xed, 0x7d, 0x01, 0x7b, 0x5e,
+ 0x18, 0xab, 0xd6, 0x7b, 0x33, 0x8e, 0x81, 0x07,
+ 0x8f, 0x21, 0xed, 0xb7, 0x86, 0x8d, 0x90, 0x1e,
+ 0xbe, 0x9c, 0x73, 0x1a, 0x7c, 0x18, 0xb5, 0xe6,
+ 0xde, 0xc1, 0xd6, 0xa7, 0x2e, 0x07, 0x8a, 0xc9,
+ 0xa4, 0x26, 0x2f, 0x86, 0x0b, 0xee, 0xfa, 0x14,
+ 0xf4, 0xe8, 0x21, 0x01, 0x82, 0x72, 0xe4, 0x11,
+ 0xa9, 0x51, 0x50, 0x2b, 0x6e, 0x79, 0x06, 0x6e,
+ 0x84, 0x25, 0x2c, 0x33, 0x46, 0xf3, 0xaa, 0x62,
+ 0x34, 0x43, 0x51, 0xa2, 0x91, 0xd4, 0xbe, 0xdc,
+ 0x7a, 0x07, 0x61, 0x8b, 0xde, 0xa2, 0xaf, 0x63,
+ 0x14, 0x5c, 0xc7, 0xa4, 0xb8, 0xd4, 0x07, 0x06,
+ 0x91, 0xae, 0x89, 0x0c, 0xd6, 0x57, 0x33, 0xe7,
+ 0x94, 0x6e, 0x90, 0x21, 0xa1, 0xdf, 0xfc, 0x4c,
+ 0x59, 0xf1, 0x59, 0x42, 0x5e, 0xe6, 0xd5, 0x0c,
+ 0xa9, 0xb1, 0x35, 0xfa, 0x61, 0x62, 0xce, 0xa1,
+ 0x8a, 0x93, 0x98, 0x38, 0xdc, 0x00, 0x0f, 0xb3,
+ 0x86, 0xfa, 0xd0, 0x86, 0xac, 0xce, 0x5a, 0xc0,
+ 0x7c, 0xb2, 0xec, 0xe7, 0xfd, 0x58, 0x0b, 0x00,
+ 0xcf, 0xa5, 0xe9, 0x85, 0x89, 0x63, 0x1d, 0xc2,
+ 0x5e, 0x8e, 0x2a, 0x3d, 0xaf, 0x2f, 0xfd, 0xec,
+ 0x26, 0x53, 0x16, 0x59, 0x91, 0x2c, 0x9d, 0x8f,
+ 0x7a, 0x15, 0xe5, 0x86, 0x5e, 0xa8, 0xfb, 0x58,
+ 0x16, 0xd6, 0x20, 0x70, 0x52, 0xbd, 0x71, 0x28,
+ 0xcd, 0x74, 0x3c, 0x12, 0xc8, 0x11, 0x87, 0x91,
+ 0xa4, 0x73, 0x68, 0x11, 0x93, 0x5e, 0xb9, 0x82,
+ 0xa5, 0x32, 0x34, 0x9e, 0x31, 0xdd, 0x40, 0x1e,
+ 0x0b, 0x66, 0x0a, 0x56, 0x8c, 0xb1, 0xa4, 0x71,
+ 0x1f, 0x55, 0x2f, 0x55, 0xde, 0xd5, 0x9f, 0x1f,
+ 0x15, 0xbf, 0x71, 0x96, 0xb3, 0xca, 0x12, 0xa9,
+ 0x1e, 0x48, 0x8e, 0xf5, 0x9d, 0x64, 0xf3, 0xa0,
+ 0x2b, 0xf4, 0x52, 0x39, 0x49, 0x9a, 0xc6, 0x17,
+ 0x6a, 0xe3, 0x21, 0xc4, 0xa2, 0x11, 0xec, 0x54,
+ 0x53, 0x65, 0x97, 0x1c, 0x5d, 0x3f, 0x4f, 0x09,
+ 0xd4, 0xeb, 0x13, 0x9b, 0xfd, 0xf2, 0x07, 0x3d,
+ 0x33, 0x18, 0x0b, 0x21, 0x00, 0x2b, 0x65, 0xcc,
+ 0x98, 0x65, 0xe7, 0x6c, 0xb2, 0x4c, 0xd9, 0x2c,
+ 0x87, 0x4c, 0x24, 0xc1, 0x83, 0x50, 0x39, 0x9a,
+ 0x93, 0x6a, 0xb3, 0x63, 0x70, 0x79, 0x29, 0x5d,
+ 0x76, 0xc4, 0x17, 0x77, 0x6b, 0x94, 0xef, 0xce,
+ 0x3a, 0x0e, 0xf7, 0x20, 0x6b, 0x15, 0x11, 0x05,
+ 0x19, 0x65, 0x5c, 0x95, 0x6c, 0xbd, 0x8b, 0x24,
+ 0x89, 0x40, 0x5e, 0xe2, 0xb0, 0x9a, 0x6b, 0x6e,
+ 0xeb, 0xe0, 0xc5, 0x37, 0x90, 0xa1, 0x2a, 0x89,
+ 0x98, 0x37, 0x8b, 0x33, 0xa5, 0xb7, 0x11, 0x59,
+ 0x62, 0x5f, 0x4b, 0xa4, 0x9d, 0x2a, 0x2f, 0xdb,
+ 0xa5, 0x9f, 0xbf, 0x08, 0x97, 0xbc, 0x7a, 0xab,
+ 0xd8, 0xd7, 0x07, 0xdc, 0x14, 0x0a, 0x80, 0xf0,
+ 0xf3, 0x09, 0xf8, 0x35, 0xd3, 0xda, 0x54, 0xab,
+ 0x58, 0x4e, 0x50, 0x1d, 0xfa, 0x0e, 0xe9, 0x77,
+ 0xfe, 0xc5, 0x43, 0xf7, 0x41, 0x86, 0xa8, 0x02,
+ 0xb9, 0xa3, 0x7a, 0xdb, 0x3e, 0x82, 0x91, 0xec,
+ 0xa0, 0x4d, 0x66, 0x52, 0x0d, 0x22, 0x9e, 0x60,
+ 0x40, 0x1e, 0x72, 0x82, 0xbe, 0xf4, 0x86, 0xae,
+ 0x05, 0x9a, 0xa7, 0x06, 0x96, 0xe0, 0xe3, 0x05,
+ 0xd7, 0x77, 0x14, 0x0a, 0x7a, 0x88, 0x3e, 0xcd,
+ 0xcb, 0x69, 0xb9, 0xff, 0x93, 0x8e, 0x8a, 0x42,
+ 0x31, 0x86, 0x4c, 0x69, 0xca, 0x2c, 0x20, 0x43,
+ 0xbe, 0xd0, 0x07, 0xff, 0x3e, 0x60, 0x5e, 0x01,
+ 0x4b, 0xcf, 0x51, 0x81, 0x38, 0xdc, 0x3a, 0x25,
+ 0xc5, 0xe2, 0x36, 0x17, 0x1a, 0x2d, 0x01, 0xd6
+};
+
+/*
+ * Vector 7
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number fd
+ * PTX 8e41b78c390b5af9d758bb214a67e9f6bf7727b09ac6124084c37611398fa45d
+ * PTX aad94868600ed391fb1acd4857a95b466e62ef9f4b377244d1c152e7b30d731a
+ * PTX ad30c716d214b707aed99eb5b5e580b3e887cf7497465651d4b60e6042051da3
+ * PTX 693c3b78c14489543be8b6ad0ba629565bba202313ba7b0d0c94a3252b676f46
+ * PTX cc02ce0f8a7d34c0ed229129673c1f61aed579d08a9203a25aac3a77e9db6026
+ * PTX 7996db38df637356d9dcd1632e369939f2a29d89345c66e05066f1a3677aef18
+ * PTX dea4113faeb629e46721a66d0a7e785d3e29af2594eb67dfa982affe0aac058f
+ * PTX 6e15864269b135418261fc3afb089472cf68c45dd7f231c6249ba0255e1e0338
+ * PTX 33fc4d00a3fe02132d7bc3873614b8aee34273581ea0325c81f0270affa13641
+ * PTX d052d36f0757d484014354d02d6883ca15c24d8c3956b1bd027bcf41f151fd80
+ * PTX 23c5340e5606f37e90fdb87c86fb4fa634b3718a30bace06a66eaf8f63c4aa3b
+ * PTX 637826a87fe8cfa44282e92cb1615af3a28e53bc74c7cba1a0977be9065d0c1a
+ * PTX 5dec6c54ae38d37f37aa35283e048e5530a85c4e7a29d7b92ec0c3169cdf2a80
+ * PTX 5c7604bce60049b9fb7b8eaac10f51ae23794ceba68bb58112e293b9b692ca72
+ * PTX 1b37c662f8574ed4dba6f88e170881c82cddc1034a0ca7e284bf0962b6b26292
+ * PTX d836fa9f73c1ac770eef0f2d3a1eaf61d3e03555fd424eedd67e18a18094f888
+ * CTX d55f684f81f4426e9fde92a5ff02df2ac896af63962888a97910c1379e20b0a3
+ * CTX b1db613fb7fe2e07004329ea5c22bfd33e3dbe4cf58cc608c2c26c19a2e2fe22
+ * CTX f98732c2b5cb844cc6c0702d91e1d50fc4382a7eba5635cd602432a2306ac4ce
+ * CTX 82f8d70c8d9bc15f918fe71e74c622d5cf71178bf6e0b9cc9f2b41dd8dbe441c
+ * CTX 41cd0c73a6dc47a348f6702f9d0e9b1b1431e948e299b9ec2272ab2c5f0c7be8
+ * CTX 6affa5dec87a0bee81d3d50007edaa2bcfccb35605155ff36ed8edd4a40dcd4b
+ * CTX 243acd11b2b987bdbfaf91a7cac27e9c5aea525ee53de7b2d3332c8644402b82
+ * CTX 3e94a7db26276d2d23aa07180f76b4fd29b9c0823099c9d62c519880aee7e969
+ * CTX 7617c1497d47bf3e571950311421b6b734d38b0db91eb85331b91ea9f61530f5
+ * CTX 4512a5a52a4bad589eb69781d537f23297bb459bdad2948a29e1550bf4787e0b
+ * CTX e95bb173cf5fab17dab7a13a052a63453d97ccec1a321954886b7a1299faaeec
+ * CTX ae35c6eaaca753b041b5e5f093bf83397fd21dd6b3012066fcc058cc32c3b09d
+ * CTX 7562dee29509b5839392c9ff05f51f3166aaac4ac5f238038a3045e6f72e48ef
+ * CTX 0fe8bc675e82c318a268e43970271bf119b81bf6a982746554f84e72b9f00280
+ * CTX a320a08142923c23c883423ff949827f29bbacdc1ccdb04938ce6098c95ba6b3
+ * CTX 2528f4ef78eed778b2e122ddfd1cbdd11d1c0a6783e011fc536d63d053260637
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v7_key1[16] = {
+
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v7_key2[16] = {
+
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v7_TW[16] = {
+
+ 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v7_PTX[512] = {
+
+ 0x8e, 0x41, 0xb7, 0x8c, 0x39, 0x0b, 0x5a, 0xf9,
+ 0xd7, 0x58, 0xbb, 0x21, 0x4a, 0x67, 0xe9, 0xf6,
+ 0xbf, 0x77, 0x27, 0xb0, 0x9a, 0xc6, 0x12, 0x40,
+ 0x84, 0xc3, 0x76, 0x11, 0x39, 0x8f, 0xa4, 0x5d,
+ 0xaa, 0xd9, 0x48, 0x68, 0x60, 0x0e, 0xd3, 0x91,
+ 0xfb, 0x1a, 0xcd, 0x48, 0x57, 0xa9, 0x5b, 0x46,
+ 0x6e, 0x62, 0xef, 0x9f, 0x4b, 0x37, 0x72, 0x44,
+ 0xd1, 0xc1, 0x52, 0xe7, 0xb3, 0x0d, 0x73, 0x1a,
+ 0xad, 0x30, 0xc7, 0x16, 0xd2, 0x14, 0xb7, 0x07,
+ 0xae, 0xd9, 0x9e, 0xb5, 0xb5, 0xe5, 0x80, 0xb3,
+ 0xe8, 0x87, 0xcf, 0x74, 0x97, 0x46, 0x56, 0x51,
+ 0xd4, 0xb6, 0x0e, 0x60, 0x42, 0x05, 0x1d, 0xa3,
+ 0x69, 0x3c, 0x3b, 0x78, 0xc1, 0x44, 0x89, 0x54,
+ 0x3b, 0xe8, 0xb6, 0xad, 0x0b, 0xa6, 0x29, 0x56,
+ 0x5b, 0xba, 0x20, 0x23, 0x13, 0xba, 0x7b, 0x0d,
+ 0x0c, 0x94, 0xa3, 0x25, 0x2b, 0x67, 0x6f, 0x46,
+ 0xcc, 0x02, 0xce, 0x0f, 0x8a, 0x7d, 0x34, 0xc0,
+ 0xed, 0x22, 0x91, 0x29, 0x67, 0x3c, 0x1f, 0x61,
+ 0xae, 0xd5, 0x79, 0xd0, 0x8a, 0x92, 0x03, 0xa2,
+ 0x5a, 0xac, 0x3a, 0x77, 0xe9, 0xdb, 0x60, 0x26,
+ 0x79, 0x96, 0xdb, 0x38, 0xdf, 0x63, 0x73, 0x56,
+ 0xd9, 0xdc, 0xd1, 0x63, 0x2e, 0x36, 0x99, 0x39,
+ 0xf2, 0xa2, 0x9d, 0x89, 0x34, 0x5c, 0x66, 0xe0,
+ 0x50, 0x66, 0xf1, 0xa3, 0x67, 0x7a, 0xef, 0x18,
+ 0xde, 0xa4, 0x11, 0x3f, 0xae, 0xb6, 0x29, 0xe4,
+ 0x67, 0x21, 0xa6, 0x6d, 0x0a, 0x7e, 0x78, 0x5d,
+ 0x3e, 0x29, 0xaf, 0x25, 0x94, 0xeb, 0x67, 0xdf,
+ 0xa9, 0x82, 0xaf, 0xfe, 0x0a, 0xac, 0x05, 0x8f,
+ 0x6e, 0x15, 0x86, 0x42, 0x69, 0xb1, 0x35, 0x41,
+ 0x82, 0x61, 0xfc, 0x3a, 0xfb, 0x08, 0x94, 0x72,
+ 0xcf, 0x68, 0xc4, 0x5d, 0xd7, 0xf2, 0x31, 0xc6,
+ 0x24, 0x9b, 0xa0, 0x25, 0x5e, 0x1e, 0x03, 0x38,
+ 0x33, 0xfc, 0x4d, 0x00, 0xa3, 0xfe, 0x02, 0x13,
+ 0x2d, 0x7b, 0xc3, 0x87, 0x36, 0x14, 0xb8, 0xae,
+ 0xe3, 0x42, 0x73, 0x58, 0x1e, 0xa0, 0x32, 0x5c,
+ 0x81, 0xf0, 0x27, 0x0a, 0xff, 0xa1, 0x36, 0x41,
+ 0xd0, 0x52, 0xd3, 0x6f, 0x07, 0x57, 0xd4, 0x84,
+ 0x01, 0x43, 0x54, 0xd0, 0x2d, 0x68, 0x83, 0xca,
+ 0x15, 0xc2, 0x4d, 0x8c, 0x39, 0x56, 0xb1, 0xbd,
+ 0x02, 0x7b, 0xcf, 0x41, 0xf1, 0x51, 0xfd, 0x80,
+ 0x23, 0xc5, 0x34, 0x0e, 0x56, 0x06, 0xf3, 0x7e,
+ 0x90, 0xfd, 0xb8, 0x7c, 0x86, 0xfb, 0x4f, 0xa6,
+ 0x34, 0xb3, 0x71, 0x8a, 0x30, 0xba, 0xce, 0x06,
+ 0xa6, 0x6e, 0xaf, 0x8f, 0x63, 0xc4, 0xaa, 0x3b,
+ 0x63, 0x78, 0x26, 0xa8, 0x7f, 0xe8, 0xcf, 0xa4,
+ 0x42, 0x82, 0xe9, 0x2c, 0xb1, 0x61, 0x5a, 0xf3,
+ 0xa2, 0x8e, 0x53, 0xbc, 0x74, 0xc7, 0xcb, 0xa1,
+ 0xa0, 0x97, 0x7b, 0xe9, 0x06, 0x5d, 0x0c, 0x1a,
+ 0x5d, 0xec, 0x6c, 0x54, 0xae, 0x38, 0xd3, 0x7f,
+ 0x37, 0xaa, 0x35, 0x28, 0x3e, 0x04, 0x8e, 0x55,
+ 0x30, 0xa8, 0x5c, 0x4e, 0x7a, 0x29, 0xd7, 0xb9,
+ 0x2e, 0xc0, 0xc3, 0x16, 0x9c, 0xdf, 0x2a, 0x80,
+ 0x5c, 0x76, 0x04, 0xbc, 0xe6, 0x00, 0x49, 0xb9,
+ 0xfb, 0x7b, 0x8e, 0xaa, 0xc1, 0x0f, 0x51, 0xae,
+ 0x23, 0x79, 0x4c, 0xeb, 0xa6, 0x8b, 0xb5, 0x81,
+ 0x12, 0xe2, 0x93, 0xb9, 0xb6, 0x92, 0xca, 0x72,
+ 0x1b, 0x37, 0xc6, 0x62, 0xf8, 0x57, 0x4e, 0xd4,
+ 0xdb, 0xa6, 0xf8, 0x8e, 0x17, 0x08, 0x81, 0xc8,
+ 0x2c, 0xdd, 0xc1, 0x03, 0x4a, 0x0c, 0xa7, 0xe2,
+ 0x84, 0xbf, 0x09, 0x62, 0xb6, 0xb2, 0x62, 0x92,
+ 0xd8, 0x36, 0xfa, 0x9f, 0x73, 0xc1, 0xac, 0x77,
+ 0x0e, 0xef, 0x0f, 0x2d, 0x3a, 0x1e, 0xaf, 0x61,
+ 0xd3, 0xe0, 0x35, 0x55, 0xfd, 0x42, 0x4e, 0xed,
+ 0xd6, 0x7e, 0x18, 0xa1, 0x80, 0x94, 0xf8, 0x88
+};
+
+static uint8_t v7_CTX[512] = {
+
+ 0xd5, 0x5f, 0x68, 0x4f, 0x81, 0xf4, 0x42, 0x6e,
+ 0x9f, 0xde, 0x92, 0xa5, 0xff, 0x02, 0xdf, 0x2a,
+ 0xc8, 0x96, 0xaf, 0x63, 0x96, 0x28, 0x88, 0xa9,
+ 0x79, 0x10, 0xc1, 0x37, 0x9e, 0x20, 0xb0, 0xa3,
+ 0xb1, 0xdb, 0x61, 0x3f, 0xb7, 0xfe, 0x2e, 0x07,
+ 0x00, 0x43, 0x29, 0xea, 0x5c, 0x22, 0xbf, 0xd3,
+ 0x3e, 0x3d, 0xbe, 0x4c, 0xf5, 0x8c, 0xc6, 0x08,
+ 0xc2, 0xc2, 0x6c, 0x19, 0xa2, 0xe2, 0xfe, 0x22,
+ 0xf9, 0x87, 0x32, 0xc2, 0xb5, 0xcb, 0x84, 0x4c,
+ 0xc6, 0xc0, 0x70, 0x2d, 0x91, 0xe1, 0xd5, 0x0f,
+ 0xc4, 0x38, 0x2a, 0x7e, 0xba, 0x56, 0x35, 0xcd,
+ 0x60, 0x24, 0x32, 0xa2, 0x30, 0x6a, 0xc4, 0xce,
+ 0x82, 0xf8, 0xd7, 0x0c, 0x8d, 0x9b, 0xc1, 0x5f,
+ 0x91, 0x8f, 0xe7, 0x1e, 0x74, 0xc6, 0x22, 0xd5,
+ 0xcf, 0x71, 0x17, 0x8b, 0xf6, 0xe0, 0xb9, 0xcc,
+ 0x9f, 0x2b, 0x41, 0xdd, 0x8d, 0xbe, 0x44, 0x1c,
+ 0x41, 0xcd, 0x0c, 0x73, 0xa6, 0xdc, 0x47, 0xa3,
+ 0x48, 0xf6, 0x70, 0x2f, 0x9d, 0x0e, 0x9b, 0x1b,
+ 0x14, 0x31, 0xe9, 0x48, 0xe2, 0x99, 0xb9, 0xec,
+ 0x22, 0x72, 0xab, 0x2c, 0x5f, 0x0c, 0x7b, 0xe8,
+ 0x6a, 0xff, 0xa5, 0xde, 0xc8, 0x7a, 0x0b, 0xee,
+ 0x81, 0xd3, 0xd5, 0x00, 0x07, 0xed, 0xaa, 0x2b,
+ 0xcf, 0xcc, 0xb3, 0x56, 0x05, 0x15, 0x5f, 0xf3,
+ 0x6e, 0xd8, 0xed, 0xd4, 0xa4, 0x0d, 0xcd, 0x4b,
+ 0x24, 0x3a, 0xcd, 0x11, 0xb2, 0xb9, 0x87, 0xbd,
+ 0xbf, 0xaf, 0x91, 0xa7, 0xca, 0xc2, 0x7e, 0x9c,
+ 0x5a, 0xea, 0x52, 0x5e, 0xe5, 0x3d, 0xe7, 0xb2,
+ 0xd3, 0x33, 0x2c, 0x86, 0x44, 0x40, 0x2b, 0x82,
+ 0x3e, 0x94, 0xa7, 0xdb, 0x26, 0x27, 0x6d, 0x2d,
+ 0x23, 0xaa, 0x07, 0x18, 0x0f, 0x76, 0xb4, 0xfd,
+ 0x29, 0xb9, 0xc0, 0x82, 0x30, 0x99, 0xc9, 0xd6,
+ 0x2c, 0x51, 0x98, 0x80, 0xae, 0xe7, 0xe9, 0x69,
+ 0x76, 0x17, 0xc1, 0x49, 0x7d, 0x47, 0xbf, 0x3e,
+ 0x57, 0x19, 0x50, 0x31, 0x14, 0x21, 0xb6, 0xb7,
+ 0x34, 0xd3, 0x8b, 0x0d, 0xb9, 0x1e, 0xb8, 0x53,
+ 0x31, 0xb9, 0x1e, 0xa9, 0xf6, 0x15, 0x30, 0xf5,
+ 0x45, 0x12, 0xa5, 0xa5, 0x2a, 0x4b, 0xad, 0x58,
+ 0x9e, 0xb6, 0x97, 0x81, 0xd5, 0x37, 0xf2, 0x32,
+ 0x97, 0xbb, 0x45, 0x9b, 0xda, 0xd2, 0x94, 0x8a,
+ 0x29, 0xe1, 0x55, 0x0b, 0xf4, 0x78, 0x7e, 0x0b,
+ 0xe9, 0x5b, 0xb1, 0x73, 0xcf, 0x5f, 0xab, 0x17,
+ 0xda, 0xb7, 0xa1, 0x3a, 0x05, 0x2a, 0x63, 0x45,
+ 0x3d, 0x97, 0xcc, 0xec, 0x1a, 0x32, 0x19, 0x54,
+ 0x88, 0x6b, 0x7a, 0x12, 0x99, 0xfa, 0xae, 0xec,
+ 0xae, 0x35, 0xc6, 0xea, 0xac, 0xa7, 0x53, 0xb0,
+ 0x41, 0xb5, 0xe5, 0xf0, 0x93, 0xbf, 0x83, 0x39,
+ 0x7f, 0xd2, 0x1d, 0xd6, 0xb3, 0x01, 0x20, 0x66,
+ 0xfc, 0xc0, 0x58, 0xcc, 0x32, 0xc3, 0xb0, 0x9d,
+ 0x75, 0x62, 0xde, 0xe2, 0x95, 0x09, 0xb5, 0x83,
+ 0x93, 0x92, 0xc9, 0xff, 0x05, 0xf5, 0x1f, 0x31,
+ 0x66, 0xaa, 0xac, 0x4a, 0xc5, 0xf2, 0x38, 0x03,
+ 0x8a, 0x30, 0x45, 0xe6, 0xf7, 0x2e, 0x48, 0xef,
+ 0x0f, 0xe8, 0xbc, 0x67, 0x5e, 0x82, 0xc3, 0x18,
+ 0xa2, 0x68, 0xe4, 0x39, 0x70, 0x27, 0x1b, 0xf1,
+ 0x19, 0xb8, 0x1b, 0xf6, 0xa9, 0x82, 0x74, 0x65,
+ 0x54, 0xf8, 0x4e, 0x72, 0xb9, 0xf0, 0x02, 0x80,
+ 0xa3, 0x20, 0xa0, 0x81, 0x42, 0x92, 0x3c, 0x23,
+ 0xc8, 0x83, 0x42, 0x3f, 0xf9, 0x49, 0x82, 0x7f,
+ 0x29, 0xbb, 0xac, 0xdc, 0x1c, 0xcd, 0xb0, 0x49,
+ 0x38, 0xce, 0x60, 0x98, 0xc9, 0x5b, 0xa6, 0xb3,
+ 0x25, 0x28, 0xf4, 0xef, 0x78, 0xee, 0xd7, 0x78,
+ 0xb2, 0xe1, 0x22, 0xdd, 0xfd, 0x1c, 0xbd, 0xd1,
+ 0x1d, 0x1c, 0x0a, 0x67, 0x83, 0xe0, 0x11, 0xfc,
+ 0x53, 0x6d, 0x63, 0xd0, 0x53, 0x26, 0x06, 0x37
+};
+
+/*
+ * Vector 8
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number fe
+ * PTX d55f684f81f4426e9fde92a5ff02df2ac896af63962888a97910c1379e20b0a3
+ * PTX b1db613fb7fe2e07004329ea5c22bfd33e3dbe4cf58cc608c2c26c19a2e2fe22
+ * PTX f98732c2b5cb844cc6c0702d91e1d50fc4382a7eba5635cd602432a2306ac4ce
+ * PTX 82f8d70c8d9bc15f918fe71e74c622d5cf71178bf6e0b9cc9f2b41dd8dbe441c
+ * PTX 41cd0c73a6dc47a348f6702f9d0e9b1b1431e948e299b9ec2272ab2c5f0c7be8
+ * PTX 6affa5dec87a0bee81d3d50007edaa2bcfccb35605155ff36ed8edd4a40dcd4b
+ * PTX 243acd11b2b987bdbfaf91a7cac27e9c5aea525ee53de7b2d3332c8644402b82
+ * PTX 3e94a7db26276d2d23aa07180f76b4fd29b9c0823099c9d62c519880aee7e969
+ * PTX 7617c1497d47bf3e571950311421b6b734d38b0db91eb85331b91ea9f61530f5
+ * PTX 4512a5a52a4bad589eb69781d537f23297bb459bdad2948a29e1550bf4787e0b
+ * PTX e95bb173cf5fab17dab7a13a052a63453d97ccec1a321954886b7a1299faaeec
+ * PTX ae35c6eaaca753b041b5e5f093bf83397fd21dd6b3012066fcc058cc32c3b09d
+ * PTX 7562dee29509b5839392c9ff05f51f3166aaac4ac5f238038a3045e6f72e48ef
+ * PTX 0fe8bc675e82c318a268e43970271bf119b81bf6a982746554f84e72b9f00280
+ * PTX a320a08142923c23c883423ff949827f29bbacdc1ccdb04938ce6098c95ba6b3
+ * PTX 2528f4ef78eed778b2e122ddfd1cbdd11d1c0a6783e011fc536d63d053260637
+ * CTX 72efc1ebfe1ee25975a6eb3aa8589dda2b261f1c85bdab442a9e5b2dd1d7c395
+ * CTX 7a16fc08e526d4b1223f1b1232a11af274c3d70dac57f83e0983c498f1a6f1ae
+ * CTX cb021c3e70085a1e527f1ce41ee5911a82020161529cd82773762daf5459de94
+ * CTX a0a82adae7e1703c808543c29ed6fb32d9e004327c1355180c995a07741493a0
+ * CTX 9c21ba01a387882da4f62534b87bb15d60d197201c0fd3bf30c1500a3ecfecdd
+ * CTX 66d8721f90bcc4c17ee925c61b0a03727a9c0d5f5ca462fbfa0af1c2513a9d9d
+ * CTX 4b5345bd27a5f6e653f751693e6b6a2b8ead57d511e00e58c45b7b8d005af792
+ * CTX 88f5c7c22fd4f1bf7a898b03a5634c6a1ae3f9fae5de4f296a2896b23e7ed43e
+ * CTX d14fa5a2803f4d28f0d3ffcf24757677aebdb47bb388378708948a8d4126ed18
+ * CTX 39e0da29a537a8c198b3c66ab00712dd261674bf45a73d67f76914f830ca014b
+ * CTX 65596f27e4cf62de66125a5566df9975155628b400fbfb3a29040ed50faffdbb
+ * CTX 18aece7c5c44693260aab386c0a37b11b114f1c415aebb653be468179428d43a
+ * CTX 4d8bc3ec38813eca30a13cf1bb18d524f1992d44d8b1a42ea30b22e6c95b199d
+ * CTX 8d182f8840b09d059585c31ad691fa0619ff038aca2c39a943421157361717c4
+ * CTX 9d322028a74648113bd8c9d7ec77cf3c89c1ec8718ceff8516d96b34c3c614f1
+ * CTX 0699c9abc4ed0411506223bea16af35c883accdbe1104eef0cfdb54e12fb230a
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v8_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v8_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v8_TW[16] = {
+ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v8_PTX[512] = {
+ 0xd5, 0x5f, 0x68, 0x4f, 0x81, 0xf4, 0x42, 0x6e,
+ 0x9f, 0xde, 0x92, 0xa5, 0xff, 0x02, 0xdf, 0x2a,
+ 0xc8, 0x96, 0xaf, 0x63, 0x96, 0x28, 0x88, 0xa9,
+ 0x79, 0x10, 0xc1, 0x37, 0x9e, 0x20, 0xb0, 0xa3,
+ 0xb1, 0xdb, 0x61, 0x3f, 0xb7, 0xfe, 0x2e, 0x07,
+ 0x00, 0x43, 0x29, 0xea, 0x5c, 0x22, 0xbf, 0xd3,
+ 0x3e, 0x3d, 0xbe, 0x4c, 0xf5, 0x8c, 0xc6, 0x08,
+ 0xc2, 0xc2, 0x6c, 0x19, 0xa2, 0xe2, 0xfe, 0x22,
+ 0xf9, 0x87, 0x32, 0xc2, 0xb5, 0xcb, 0x84, 0x4c,
+ 0xc6, 0xc0, 0x70, 0x2d, 0x91, 0xe1, 0xd5, 0x0f,
+ 0xc4, 0x38, 0x2a, 0x7e, 0xba, 0x56, 0x35, 0xcd,
+ 0x60, 0x24, 0x32, 0xa2, 0x30, 0x6a, 0xc4, 0xce,
+ 0x82, 0xf8, 0xd7, 0x0c, 0x8d, 0x9b, 0xc1, 0x5f,
+ 0x91, 0x8f, 0xe7, 0x1e, 0x74, 0xc6, 0x22, 0xd5,
+ 0xcf, 0x71, 0x17, 0x8b, 0xf6, 0xe0, 0xb9, 0xcc,
+ 0x9f, 0x2b, 0x41, 0xdd, 0x8d, 0xbe, 0x44, 0x1c,
+ 0x41, 0xcd, 0x0c, 0x73, 0xa6, 0xdc, 0x47, 0xa3,
+ 0x48, 0xf6, 0x70, 0x2f, 0x9d, 0x0e, 0x9b, 0x1b,
+ 0x14, 0x31, 0xe9, 0x48, 0xe2, 0x99, 0xb9, 0xec,
+ 0x22, 0x72, 0xab, 0x2c, 0x5f, 0x0c, 0x7b, 0xe8,
+ 0x6a, 0xff, 0xa5, 0xde, 0xc8, 0x7a, 0x0b, 0xee,
+ 0x81, 0xd3, 0xd5, 0x00, 0x07, 0xed, 0xaa, 0x2b,
+ 0xcf, 0xcc, 0xb3, 0x56, 0x05, 0x15, 0x5f, 0xf3,
+ 0x6e, 0xd8, 0xed, 0xd4, 0xa4, 0x0d, 0xcd, 0x4b,
+ 0x24, 0x3a, 0xcd, 0x11, 0xb2, 0xb9, 0x87, 0xbd,
+ 0xbf, 0xaf, 0x91, 0xa7, 0xca, 0xc2, 0x7e, 0x9c,
+ 0x5a, 0xea, 0x52, 0x5e, 0xe5, 0x3d, 0xe7, 0xb2,
+ 0xd3, 0x33, 0x2c, 0x86, 0x44, 0x40, 0x2b, 0x82,
+ 0x3e, 0x94, 0xa7, 0xdb, 0x26, 0x27, 0x6d, 0x2d,
+ 0x23, 0xaa, 0x07, 0x18, 0x0f, 0x76, 0xb4, 0xfd,
+ 0x29, 0xb9, 0xc0, 0x82, 0x30, 0x99, 0xc9, 0xd6,
+ 0x2c, 0x51, 0x98, 0x80, 0xae, 0xe7, 0xe9, 0x69,
+ 0x76, 0x17, 0xc1, 0x49, 0x7d, 0x47, 0xbf, 0x3e,
+ 0x57, 0x19, 0x50, 0x31, 0x14, 0x21, 0xb6, 0xb7,
+ 0x34, 0xd3, 0x8b, 0x0d, 0xb9, 0x1e, 0xb8, 0x53,
+ 0x31, 0xb9, 0x1e, 0xa9, 0xf6, 0x15, 0x30, 0xf5,
+ 0x45, 0x12, 0xa5, 0xa5, 0x2a, 0x4b, 0xad, 0x58,
+ 0x9e, 0xb6, 0x97, 0x81, 0xd5, 0x37, 0xf2, 0x32,
+ 0x97, 0xbb, 0x45, 0x9b, 0xda, 0xd2, 0x94, 0x8a,
+ 0x29, 0xe1, 0x55, 0x0b, 0xf4, 0x78, 0x7e, 0x0b,
+ 0xe9, 0x5b, 0xb1, 0x73, 0xcf, 0x5f, 0xab, 0x17,
+ 0xda, 0xb7, 0xa1, 0x3a, 0x05, 0x2a, 0x63, 0x45,
+ 0x3d, 0x97, 0xcc, 0xec, 0x1a, 0x32, 0x19, 0x54,
+ 0x88, 0x6b, 0x7a, 0x12, 0x99, 0xfa, 0xae, 0xec,
+ 0xae, 0x35, 0xc6, 0xea, 0xac, 0xa7, 0x53, 0xb0,
+ 0x41, 0xb5, 0xe5, 0xf0, 0x93, 0xbf, 0x83, 0x39,
+ 0x7f, 0xd2, 0x1d, 0xd6, 0xb3, 0x01, 0x20, 0x66,
+ 0xfc, 0xc0, 0x58, 0xcc, 0x32, 0xc3, 0xb0, 0x9d,
+ 0x75, 0x62, 0xde, 0xe2, 0x95, 0x09, 0xb5, 0x83,
+ 0x93, 0x92, 0xc9, 0xff, 0x05, 0xf5, 0x1f, 0x31,
+ 0x66, 0xaa, 0xac, 0x4a, 0xc5, 0xf2, 0x38, 0x03,
+ 0x8a, 0x30, 0x45, 0xe6, 0xf7, 0x2e, 0x48, 0xef,
+ 0x0f, 0xe8, 0xbc, 0x67, 0x5e, 0x82, 0xc3, 0x18,
+ 0xa2, 0x68, 0xe4, 0x39, 0x70, 0x27, 0x1b, 0xf1,
+ 0x19, 0xb8, 0x1b, 0xf6, 0xa9, 0x82, 0x74, 0x65,
+ 0x54, 0xf8, 0x4e, 0x72, 0xb9, 0xf0, 0x02, 0x80,
+ 0xa3, 0x20, 0xa0, 0x81, 0x42, 0x92, 0x3c, 0x23,
+ 0xc8, 0x83, 0x42, 0x3f, 0xf9, 0x49, 0x82, 0x7f,
+ 0x29, 0xbb, 0xac, 0xdc, 0x1c, 0xcd, 0xb0, 0x49,
+ 0x38, 0xce, 0x60, 0x98, 0xc9, 0x5b, 0xa6, 0xb3,
+ 0x25, 0x28, 0xf4, 0xef, 0x78, 0xee, 0xd7, 0x78,
+ 0xb2, 0xe1, 0x22, 0xdd, 0xfd, 0x1c, 0xbd, 0xd1,
+ 0x1d, 0x1c, 0x0a, 0x67, 0x83, 0xe0, 0x11, 0xfc,
+ 0x53, 0x6d, 0x63, 0xd0, 0x53, 0x26, 0x06, 0x37
+};
+
+static uint8_t v8_CTX[512] = {
+ 0x72, 0xef, 0xc1, 0xeb, 0xfe, 0x1e, 0xe2, 0x59,
+ 0x75, 0xa6, 0xeb, 0x3a, 0xa8, 0x58, 0x9d, 0xda,
+ 0x2b, 0x26, 0x1f, 0x1c, 0x85, 0xbd, 0xab, 0x44,
+ 0x2a, 0x9e, 0x5b, 0x2d, 0xd1, 0xd7, 0xc3, 0x95,
+ 0x7a, 0x16, 0xfc, 0x08, 0xe5, 0x26, 0xd4, 0xb1,
+ 0x22, 0x3f, 0x1b, 0x12, 0x32, 0xa1, 0x1a, 0xf2,
+ 0x74, 0xc3, 0xd7, 0x0d, 0xac, 0x57, 0xf8, 0x3e,
+ 0x09, 0x83, 0xc4, 0x98, 0xf1, 0xa6, 0xf1, 0xae,
+ 0xcb, 0x02, 0x1c, 0x3e, 0x70, 0x08, 0x5a, 0x1e,
+ 0x52, 0x7f, 0x1c, 0xe4, 0x1e, 0xe5, 0x91, 0x1a,
+ 0x82, 0x02, 0x01, 0x61, 0x52, 0x9c, 0xd8, 0x27,
+ 0x73, 0x76, 0x2d, 0xaf, 0x54, 0x59, 0xde, 0x94,
+ 0xa0, 0xa8, 0x2a, 0xda, 0xe7, 0xe1, 0x70, 0x3c,
+ 0x80, 0x85, 0x43, 0xc2, 0x9e, 0xd6, 0xfb, 0x32,
+ 0xd9, 0xe0, 0x04, 0x32, 0x7c, 0x13, 0x55, 0x18,
+ 0x0c, 0x99, 0x5a, 0x07, 0x74, 0x14, 0x93, 0xa0,
+ 0x9c, 0x21, 0xba, 0x01, 0xa3, 0x87, 0x88, 0x2d,
+ 0xa4, 0xf6, 0x25, 0x34, 0xb8, 0x7b, 0xb1, 0x5d,
+ 0x60, 0xd1, 0x97, 0x20, 0x1c, 0x0f, 0xd3, 0xbf,
+ 0x30, 0xc1, 0x50, 0x0a, 0x3e, 0xcf, 0xec, 0xdd,
+ 0x66, 0xd8, 0x72, 0x1f, 0x90, 0xbc, 0xc4, 0xc1,
+ 0x7e, 0xe9, 0x25, 0xc6, 0x1b, 0x0a, 0x03, 0x72,
+ 0x7a, 0x9c, 0x0d, 0x5f, 0x5c, 0xa4, 0x62, 0xfb,
+ 0xfa, 0x0a, 0xf1, 0xc2, 0x51, 0x3a, 0x9d, 0x9d,
+ 0x4b, 0x53, 0x45, 0xbd, 0x27, 0xa5, 0xf6, 0xe6,
+ 0x53, 0xf7, 0x51, 0x69, 0x3e, 0x6b, 0x6a, 0x2b,
+ 0x8e, 0xad, 0x57, 0xd5, 0x11, 0xe0, 0x0e, 0x58,
+ 0xc4, 0x5b, 0x7b, 0x8d, 0x00, 0x5a, 0xf7, 0x92,
+ 0x88, 0xf5, 0xc7, 0xc2, 0x2f, 0xd4, 0xf1, 0xbf,
+ 0x7a, 0x89, 0x8b, 0x03, 0xa5, 0x63, 0x4c, 0x6a,
+ 0x1a, 0xe3, 0xf9, 0xfa, 0xe5, 0xde, 0x4f, 0x29,
+ 0x6a, 0x28, 0x96, 0xb2, 0x3e, 0x7e, 0xd4, 0x3e,
+ 0xd1, 0x4f, 0xa5, 0xa2, 0x80, 0x3f, 0x4d, 0x28,
+ 0xf0, 0xd3, 0xff, 0xcf, 0x24, 0x75, 0x76, 0x77,
+ 0xae, 0xbd, 0xb4, 0x7b, 0xb3, 0x88, 0x37, 0x87,
+ 0x08, 0x94, 0x8a, 0x8d, 0x41, 0x26, 0xed, 0x18,
+ 0x39, 0xe0, 0xda, 0x29, 0xa5, 0x37, 0xa8, 0xc1,
+ 0x98, 0xb3, 0xc6, 0x6a, 0xb0, 0x07, 0x12, 0xdd,
+ 0x26, 0x16, 0x74, 0xbf, 0x45, 0xa7, 0x3d, 0x67,
+ 0xf7, 0x69, 0x14, 0xf8, 0x30, 0xca, 0x01, 0x4b,
+ 0x65, 0x59, 0x6f, 0x27, 0xe4, 0xcf, 0x62, 0xde,
+ 0x66, 0x12, 0x5a, 0x55, 0x66, 0xdf, 0x99, 0x75,
+ 0x15, 0x56, 0x28, 0xb4, 0x00, 0xfb, 0xfb, 0x3a,
+ 0x29, 0x04, 0x0e, 0xd5, 0x0f, 0xaf, 0xfd, 0xbb,
+ 0x18, 0xae, 0xce, 0x7c, 0x5c, 0x44, 0x69, 0x32,
+ 0x60, 0xaa, 0xb3, 0x86, 0xc0, 0xa3, 0x7b, 0x11,
+ 0xb1, 0x14, 0xf1, 0xc4, 0x15, 0xae, 0xbb, 0x65,
+ 0x3b, 0xe4, 0x68, 0x17, 0x94, 0x28, 0xd4, 0x3a,
+ 0x4d, 0x8b, 0xc3, 0xec, 0x38, 0x81, 0x3e, 0xca,
+ 0x30, 0xa1, 0x3c, 0xf1, 0xbb, 0x18, 0xd5, 0x24,
+ 0xf1, 0x99, 0x2d, 0x44, 0xd8, 0xb1, 0xa4, 0x2e,
+ 0xa3, 0x0b, 0x22, 0xe6, 0xc9, 0x5b, 0x19, 0x9d,
+ 0x8d, 0x18, 0x2f, 0x88, 0x40, 0xb0, 0x9d, 0x05,
+ 0x95, 0x85, 0xc3, 0x1a, 0xd6, 0x91, 0xfa, 0x06,
+ 0x19, 0xff, 0x03, 0x8a, 0xca, 0x2c, 0x39, 0xa9,
+ 0x43, 0x42, 0x11, 0x57, 0x36, 0x17, 0x17, 0xc4,
+ 0x9d, 0x32, 0x20, 0x28, 0xa7, 0x46, 0x48, 0x11,
+ 0x3b, 0xd8, 0xc9, 0xd7, 0xec, 0x77, 0xcf, 0x3c,
+ 0x89, 0xc1, 0xec, 0x87, 0x18, 0xce, 0xff, 0x85,
+ 0x16, 0xd9, 0x6b, 0x34, 0xc3, 0xc6, 0x14, 0xf1,
+ 0x06, 0x99, 0xc9, 0xab, 0xc4, 0xed, 0x04, 0x11,
+ 0x50, 0x62, 0x23, 0xbe, 0xa1, 0x6a, 0xf3, 0x5c,
+ 0x88, 0x3a, 0xcc, 0xdb, 0xe1, 0x10, 0x4e, 0xef,
+ 0x0c, 0xfd, 0xb5, 0x4e, 0x12, 0xfb, 0x23, 0x0a
+};
+
+/*
+ * Vector 9
+ * Key1 27182818284590452353602874713526
+ * Key2 31415926535897932384626433832795
+ * Data Unit Sequence Number ff
+ * PTX 72efc1ebfe1ee25975a6eb3aa8589dda2b261f1c85bdab442a9e5b2dd1d7c395
+ * PTX 7a16fc08e526d4b1223f1b1232a11af274c3d70dac57f83e0983c498f1a6f1ae
+ * PTX cb021c3e70085a1e527f1ce41ee5911a82020161529cd82773762daf5459de94
+ * PTX a0a82adae7e1703c808543c29ed6fb32d9e004327c1355180c995a07741493a0
+ * PTX 9c21ba01a387882da4f62534b87bb15d60d197201c0fd3bf30c1500a3ecfecdd
+ * PTX 66d8721f90bcc4c17ee925c61b0a03727a9c0d5f5ca462fbfa0af1c2513a9d9d
+ * PTX 4b5345bd27a5f6e653f751693e6b6a2b8ead57d511e00e58c45b7b8d005af792
+ * PTX 88f5c7c22fd4f1bf7a898b03a5634c6a1ae3f9fae5de4f296a2896b23e7ed43e
+ * PTX d14fa5a2803f4d28f0d3ffcf24757677aebdb47bb388378708948a8d4126ed18
+ * PTX 39e0da29a537a8c198b3c66ab00712dd261674bf45a73d67f76914f830ca014b
+ * PTX 65596f27e4cf62de66125a5566df9975155628b400fbfb3a29040ed50faffdbb
+ * PTX 18aece7c5c44693260aab386c0a37b11b114f1c415aebb653be468179428d43a
+ * PTX 4d8bc3ec38813eca30a13cf1bb18d524f1992d44d8b1a42ea30b22e6c95b199d
+ * PTX 8d182f8840b09d059585c31ad691fa0619ff038aca2c39a943421157361717c4
+ * PTX 9d322028a74648113bd8c9d7ec77cf3c89c1ec8718ceff8516d96b34c3c614f1
+ * PTX 0699c9abc4ed0411506223bea16af35c883accdbe1104eef0cfdb54e12fb230a
+ * CTX 3260ae8dad1f4a32c5cafe3ab0eb95549d461a67ceb9e5aa2d3afb62dece0553
+ * CTX 193ba50c75be251e08d1d08f1088576c7efdfaaf3f459559571e12511753b07a
+ * CTX f073f35da06af0ce0bbf6b8f5ccc5cea500ec1b211bd51f63b606bf6528796ca
+ * CTX 12173ba39b8935ee44ccce646f90a45bf9ccc567f0ace13dc2d53ebeedc81f58
+ * CTX b2e41179dddf0d5a5c42f5d8506c1a5d2f8f59f3ea873cbcd0eec19acbf32542
+ * CTX 3bd3dcb8c2b1bf1d1eaed0eba7f0698e4314fbeb2f1566d1b9253008cbccf45a
+ * CTX 2b0d9c5c9c21474f4076e02be26050b99dee4fd68a4cf890e496e4fcae7b70f9
+ * CTX 4ea5a9062da0daeba1993d2ccd1dd3c244b8428801495a58b216547e7e847c46
+ * CTX d1d756377b6242d2e5fb83bf752b54e0df71e889f3a2bb0f4c10805bf3c59037
+ * CTX 6e3c24e22ff57f7fa965577375325cea5d920db94b9c336b455f6e894c01866f
+ * CTX e9fbb8c8d3f70a2957285f6dfb5dcd8cbf54782f8fe7766d4723819913ac7734
+ * CTX 21e3a31095866bad22c86a6036b2518b2059b4229d18c8c2ccbdf906c6cc6e82
+ * CTX 464ee57bddb0bebcb1dc645325bfb3e665ef7251082c88ebb1cf203bd779fdd3
+ * CTX 8675713c8daadd17e1cabee432b09787b6ddf3304e38b731b45df5df51b78fcf
+ * CTX b3d32466028d0ba36555e7e11ab0ee0666061d1645d962444bc47a38188930a8
+ * CTX 4b4d561395c73c087021927ca638b7afc8a8679ccb84c26555440ec7f10445cd
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v9_key1[16] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26
+};
+
+static uint8_t v9_key2[16] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95
+};
+
+static uint8_t v9_TW[16] = {
+ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v9_PTX[512] = {
+ 0x72, 0xef, 0xc1, 0xeb, 0xfe, 0x1e, 0xe2, 0x59,
+ 0x75, 0xa6, 0xeb, 0x3a, 0xa8, 0x58, 0x9d, 0xda,
+ 0x2b, 0x26, 0x1f, 0x1c, 0x85, 0xbd, 0xab, 0x44,
+ 0x2a, 0x9e, 0x5b, 0x2d, 0xd1, 0xd7, 0xc3, 0x95,
+ 0x7a, 0x16, 0xfc, 0x08, 0xe5, 0x26, 0xd4, 0xb1,
+ 0x22, 0x3f, 0x1b, 0x12, 0x32, 0xa1, 0x1a, 0xf2,
+ 0x74, 0xc3, 0xd7, 0x0d, 0xac, 0x57, 0xf8, 0x3e,
+ 0x09, 0x83, 0xc4, 0x98, 0xf1, 0xa6, 0xf1, 0xae,
+ 0xcb, 0x02, 0x1c, 0x3e, 0x70, 0x08, 0x5a, 0x1e,
+ 0x52, 0x7f, 0x1c, 0xe4, 0x1e, 0xe5, 0x91, 0x1a,
+ 0x82, 0x02, 0x01, 0x61, 0x52, 0x9c, 0xd8, 0x27,
+ 0x73, 0x76, 0x2d, 0xaf, 0x54, 0x59, 0xde, 0x94,
+ 0xa0, 0xa8, 0x2a, 0xda, 0xe7, 0xe1, 0x70, 0x3c,
+ 0x80, 0x85, 0x43, 0xc2, 0x9e, 0xd6, 0xfb, 0x32,
+ 0xd9, 0xe0, 0x04, 0x32, 0x7c, 0x13, 0x55, 0x18,
+ 0x0c, 0x99, 0x5a, 0x07, 0x74, 0x14, 0x93, 0xa0,
+ 0x9c, 0x21, 0xba, 0x01, 0xa3, 0x87, 0x88, 0x2d,
+ 0xa4, 0xf6, 0x25, 0x34, 0xb8, 0x7b, 0xb1, 0x5d,
+ 0x60, 0xd1, 0x97, 0x20, 0x1c, 0x0f, 0xd3, 0xbf,
+ 0x30, 0xc1, 0x50, 0x0a, 0x3e, 0xcf, 0xec, 0xdd,
+ 0x66, 0xd8, 0x72, 0x1f, 0x90, 0xbc, 0xc4, 0xc1,
+ 0x7e, 0xe9, 0x25, 0xc6, 0x1b, 0x0a, 0x03, 0x72,
+ 0x7a, 0x9c, 0x0d, 0x5f, 0x5c, 0xa4, 0x62, 0xfb,
+ 0xfa, 0x0a, 0xf1, 0xc2, 0x51, 0x3a, 0x9d, 0x9d,
+ 0x4b, 0x53, 0x45, 0xbd, 0x27, 0xa5, 0xf6, 0xe6,
+ 0x53, 0xf7, 0x51, 0x69, 0x3e, 0x6b, 0x6a, 0x2b,
+ 0x8e, 0xad, 0x57, 0xd5, 0x11, 0xe0, 0x0e, 0x58,
+ 0xc4, 0x5b, 0x7b, 0x8d, 0x00, 0x5a, 0xf7, 0x92,
+ 0x88, 0xf5, 0xc7, 0xc2, 0x2f, 0xd4, 0xf1, 0xbf,
+ 0x7a, 0x89, 0x8b, 0x03, 0xa5, 0x63, 0x4c, 0x6a,
+ 0x1a, 0xe3, 0xf9, 0xfa, 0xe5, 0xde, 0x4f, 0x29,
+ 0x6a, 0x28, 0x96, 0xb2, 0x3e, 0x7e, 0xd4, 0x3e,
+ 0xd1, 0x4f, 0xa5, 0xa2, 0x80, 0x3f, 0x4d, 0x28,
+ 0xf0, 0xd3, 0xff, 0xcf, 0x24, 0x75, 0x76, 0x77,
+ 0xae, 0xbd, 0xb4, 0x7b, 0xb3, 0x88, 0x37, 0x87,
+ 0x08, 0x94, 0x8a, 0x8d, 0x41, 0x26, 0xed, 0x18,
+ 0x39, 0xe0, 0xda, 0x29, 0xa5, 0x37, 0xa8, 0xc1,
+ 0x98, 0xb3, 0xc6, 0x6a, 0xb0, 0x07, 0x12, 0xdd,
+ 0x26, 0x16, 0x74, 0xbf, 0x45, 0xa7, 0x3d, 0x67,
+ 0xf7, 0x69, 0x14, 0xf8, 0x30, 0xca, 0x01, 0x4b,
+ 0x65, 0x59, 0x6f, 0x27, 0xe4, 0xcf, 0x62, 0xde,
+ 0x66, 0x12, 0x5a, 0x55, 0x66, 0xdf, 0x99, 0x75,
+ 0x15, 0x56, 0x28, 0xb4, 0x00, 0xfb, 0xfb, 0x3a,
+ 0x29, 0x04, 0x0e, 0xd5, 0x0f, 0xaf, 0xfd, 0xbb,
+ 0x18, 0xae, 0xce, 0x7c, 0x5c, 0x44, 0x69, 0x32,
+ 0x60, 0xaa, 0xb3, 0x86, 0xc0, 0xa3, 0x7b, 0x11,
+ 0xb1, 0x14, 0xf1, 0xc4, 0x15, 0xae, 0xbb, 0x65,
+ 0x3b, 0xe4, 0x68, 0x17, 0x94, 0x28, 0xd4, 0x3a,
+ 0x4d, 0x8b, 0xc3, 0xec, 0x38, 0x81, 0x3e, 0xca,
+ 0x30, 0xa1, 0x3c, 0xf1, 0xbb, 0x18, 0xd5, 0x24,
+ 0xf1, 0x99, 0x2d, 0x44, 0xd8, 0xb1, 0xa4, 0x2e,
+ 0xa3, 0x0b, 0x22, 0xe6, 0xc9, 0x5b, 0x19, 0x9d,
+ 0x8d, 0x18, 0x2f, 0x88, 0x40, 0xb0, 0x9d, 0x05,
+ 0x95, 0x85, 0xc3, 0x1a, 0xd6, 0x91, 0xfa, 0x06,
+ 0x19, 0xff, 0x03, 0x8a, 0xca, 0x2c, 0x39, 0xa9,
+ 0x43, 0x42, 0x11, 0x57, 0x36, 0x17, 0x17, 0xc4,
+ 0x9d, 0x32, 0x20, 0x28, 0xa7, 0x46, 0x48, 0x11,
+ 0x3b, 0xd8, 0xc9, 0xd7, 0xec, 0x77, 0xcf, 0x3c,
+ 0x89, 0xc1, 0xec, 0x87, 0x18, 0xce, 0xff, 0x85,
+ 0x16, 0xd9, 0x6b, 0x34, 0xc3, 0xc6, 0x14, 0xf1,
+ 0x06, 0x99, 0xc9, 0xab, 0xc4, 0xed, 0x04, 0x11,
+ 0x50, 0x62, 0x23, 0xbe, 0xa1, 0x6a, 0xf3, 0x5c,
+ 0x88, 0x3a, 0xcc, 0xdb, 0xe1, 0x10, 0x4e, 0xef,
+ 0x0c, 0xfd, 0xb5, 0x4e, 0x12, 0xfb, 0x23, 0x0a
+};
+
+static uint8_t v9_CTX[512] = {
+ 0x32, 0x60, 0xae, 0x8d, 0xad, 0x1f, 0x4a, 0x32,
+ 0xc5, 0xca, 0xfe, 0x3a, 0xb0, 0xeb, 0x95, 0x54,
+ 0x9d, 0x46, 0x1a, 0x67, 0xce, 0xb9, 0xe5, 0xaa,
+ 0x2d, 0x3a, 0xfb, 0x62, 0xde, 0xce, 0x05, 0x53,
+ 0x19, 0x3b, 0xa5, 0x0c, 0x75, 0xbe, 0x25, 0x1e,
+ 0x08, 0xd1, 0xd0, 0x8f, 0x10, 0x88, 0x57, 0x6c,
+ 0x7e, 0xfd, 0xfa, 0xaf, 0x3f, 0x45, 0x95, 0x59,
+ 0x57, 0x1e, 0x12, 0x51, 0x17, 0x53, 0xb0, 0x7a,
+ 0xf0, 0x73, 0xf3, 0x5d, 0xa0, 0x6a, 0xf0, 0xce,
+ 0x0b, 0xbf, 0x6b, 0x8f, 0x5c, 0xcc, 0x5c, 0xea,
+ 0x50, 0x0e, 0xc1, 0xb2, 0x11, 0xbd, 0x51, 0xf6,
+ 0x3b, 0x60, 0x6b, 0xf6, 0x52, 0x87, 0x96, 0xca,
+ 0x12, 0x17, 0x3b, 0xa3, 0x9b, 0x89, 0x35, 0xee,
+ 0x44, 0xcc, 0xce, 0x64, 0x6f, 0x90, 0xa4, 0x5b,
+ 0xf9, 0xcc, 0xc5, 0x67, 0xf0, 0xac, 0xe1, 0x3d,
+ 0xc2, 0xd5, 0x3e, 0xbe, 0xed, 0xc8, 0x1f, 0x58,
+ 0xb2, 0xe4, 0x11, 0x79, 0xdd, 0xdf, 0x0d, 0x5a,
+ 0x5c, 0x42, 0xf5, 0xd8, 0x50, 0x6c, 0x1a, 0x5d,
+ 0x2f, 0x8f, 0x59, 0xf3, 0xea, 0x87, 0x3c, 0xbc,
+ 0xd0, 0xee, 0xc1, 0x9a, 0xcb, 0xf3, 0x25, 0x42,
+ 0x3b, 0xd3, 0xdc, 0xb8, 0xc2, 0xb1, 0xbf, 0x1d,
+ 0x1e, 0xae, 0xd0, 0xeb, 0xa7, 0xf0, 0x69, 0x8e,
+ 0x43, 0x14, 0xfb, 0xeb, 0x2f, 0x15, 0x66, 0xd1,
+ 0xb9, 0x25, 0x30, 0x08, 0xcb, 0xcc, 0xf4, 0x5a,
+ 0x2b, 0x0d, 0x9c, 0x5c, 0x9c, 0x21, 0x47, 0x4f,
+ 0x40, 0x76, 0xe0, 0x2b, 0xe2, 0x60, 0x50, 0xb9,
+ 0x9d, 0xee, 0x4f, 0xd6, 0x8a, 0x4c, 0xf8, 0x90,
+ 0xe4, 0x96, 0xe4, 0xfc, 0xae, 0x7b, 0x70, 0xf9,
+ 0x4e, 0xa5, 0xa9, 0x06, 0x2d, 0xa0, 0xda, 0xeb,
+ 0xa1, 0x99, 0x3d, 0x2c, 0xcd, 0x1d, 0xd3, 0xc2,
+ 0x44, 0xb8, 0x42, 0x88, 0x01, 0x49, 0x5a, 0x58,
+ 0xb2, 0x16, 0x54, 0x7e, 0x7e, 0x84, 0x7c, 0x46,
+ 0xd1, 0xd7, 0x56, 0x37, 0x7b, 0x62, 0x42, 0xd2,
+ 0xe5, 0xfb, 0x83, 0xbf, 0x75, 0x2b, 0x54, 0xe0,
+ 0xdf, 0x71, 0xe8, 0x89, 0xf3, 0xa2, 0xbb, 0x0f,
+ 0x4c, 0x10, 0x80, 0x5b, 0xf3, 0xc5, 0x90, 0x37,
+ 0x6e, 0x3c, 0x24, 0xe2, 0x2f, 0xf5, 0x7f, 0x7f,
+ 0xa9, 0x65, 0x57, 0x73, 0x75, 0x32, 0x5c, 0xea,
+ 0x5d, 0x92, 0x0d, 0xb9, 0x4b, 0x9c, 0x33, 0x6b,
+ 0x45, 0x5f, 0x6e, 0x89, 0x4c, 0x01, 0x86, 0x6f,
+ 0xe9, 0xfb, 0xb8, 0xc8, 0xd3, 0xf7, 0x0a, 0x29,
+ 0x57, 0x28, 0x5f, 0x6d, 0xfb, 0x5d, 0xcd, 0x8c,
+ 0xbf, 0x54, 0x78, 0x2f, 0x8f, 0xe7, 0x76, 0x6d,
+ 0x47, 0x23, 0x81, 0x99, 0x13, 0xac, 0x77, 0x34,
+ 0x21, 0xe3, 0xa3, 0x10, 0x95, 0x86, 0x6b, 0xad,
+ 0x22, 0xc8, 0x6a, 0x60, 0x36, 0xb2, 0x51, 0x8b,
+ 0x20, 0x59, 0xb4, 0x22, 0x9d, 0x18, 0xc8, 0xc2,
+ 0xcc, 0xbd, 0xf9, 0x06, 0xc6, 0xcc, 0x6e, 0x82,
+ 0x46, 0x4e, 0xe5, 0x7b, 0xdd, 0xb0, 0xbe, 0xbc,
+ 0xb1, 0xdc, 0x64, 0x53, 0x25, 0xbf, 0xb3, 0xe6,
+ 0x65, 0xef, 0x72, 0x51, 0x08, 0x2c, 0x88, 0xeb,
+ 0xb1, 0xcf, 0x20, 0x3b, 0xd7, 0x79, 0xfd, 0xd3,
+ 0x86, 0x75, 0x71, 0x3c, 0x8d, 0xaa, 0xdd, 0x17,
+ 0xe1, 0xca, 0xbe, 0xe4, 0x32, 0xb0, 0x97, 0x87,
+ 0xb6, 0xdd, 0xf3, 0x30, 0x4e, 0x38, 0xb7, 0x31,
+ 0xb4, 0x5d, 0xf5, 0xdf, 0x51, 0xb7, 0x8f, 0xcf,
+ 0xb3, 0xd3, 0x24, 0x66, 0x02, 0x8d, 0x0b, 0xa3,
+ 0x65, 0x55, 0xe7, 0xe1, 0x1a, 0xb0, 0xee, 0x06,
+ 0x66, 0x06, 0x1d, 0x16, 0x45, 0xd9, 0x62, 0x44,
+ 0x4b, 0xc4, 0x7a, 0x38, 0x18, 0x89, 0x30, 0xa8,
+ 0x4b, 0x4d, 0x56, 0x13, 0x95, 0xc7, 0x3c, 0x08,
+ 0x70, 0x21, 0x92, 0x7c, 0xa6, 0x38, 0xb7, 0xaf,
+ 0xc8, 0xa8, 0x67, 0x9c, 0xcb, 0x84, 0xc2, 0x65,
+ 0x55, 0x44, 0x0e, 0xc7, 0xf1, 0x04, 0x45, 0xcd
+};
+
+/*
+ * Vector 15
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f10
+ * CTX 6c1625db4671522d3d7599601de7ca09ed
+ * Plaintext length (bytes): 17
+ */
+
+static uint8_t v15_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v15_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v15_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v15_PTX[17] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10
+};
+
+static uint8_t v15_CTX[17] = {
+ 0x6c, 0x16, 0x25, 0xdb, 0x46, 0x71, 0x52, 0x2d,
+ 0x3d, 0x75, 0x99, 0x60, 0x1d, 0xe7, 0xca, 0x09,
+ 0xed
+};
+
+/*
+ * Vector 16
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f1011
+ * CTX d069444b7a7e0cab09e24447d24deb1fedbf
+ * Plaintext length (bytes): 18
+ */
+static uint8_t v16_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v16_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v16_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v16_PTX[18] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11
+};
+
+static uint8_t v16_CTX[18] = {
+ 0xd0, 0x69, 0x44, 0x4b, 0x7a, 0x7e, 0x0c, 0xab,
+ 0x09, 0xe2, 0x44, 0x47, 0xd2, 0x4d, 0xeb, 0x1f,
+ 0xed, 0xbf
+};
+
+/*
+ * Vector 17
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f101112
+ * CTX e5df1351c0544ba1350b3363cd8ef4beedbf9d
+ * Plaintext length (bytes): 19
+ */
+
+static uint8_t v17_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v17_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v17_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v17_PTX[19] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12
+};
+
+static uint8_t v17_CTX[19] = {
+ 0xe5, 0xdf, 0x13, 0x51, 0xc0, 0x54, 0x4b, 0xa1,
+ 0x35, 0x0b, 0x33, 0x63, 0xcd, 0x8e, 0xf4, 0xbe,
+ 0xed, 0xbf, 0x9d
+};
+
+/*
+ * Vector 18
+ * Key1 fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ * Key2 bfbebdbcbbbab9b8b7b6b5b4b3b2b1b0
+ * Data unit sequence number 9a78563412
+ * PTX 000102030405060708090a0b0c0d0e0f10111213
+ * CTX 9d84c813f719aa2c7be3f66171c7c5c2edbf9dac
+ * Plaintext length (bytes): 20
+ */
+
+static uint8_t v18_key1[16] = {
+ 0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8,
+ 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0xf0
+};
+
+static uint8_t v18_key2[16] = {
+ 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
+ 0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0
+};
+
+static uint8_t v18_TW[16] = {
+ 0x9a, 0x78, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v18_PTX[20] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13
+};
+
+static uint8_t v18_CTX[20] = {
+ 0x9d, 0x84, 0xc8, 0x13, 0xf7, 0x19, 0xaa, 0x2c,
+ 0x7b, 0xe3, 0xf6, 0x61, 0x71, 0xc7, 0xc5, 0xc2,
+ 0xed, 0xbf, 0x9d, 0xac
+};
+
+/*
+ * Vector 19
+ * Key1 e0e1e2e3e4e5e6e7e8e9eaebecedeeef
+ * Key2 c0c1c2c3c4c5c6c7c8c9cacbcccdcecf
+ * Data unit sequence number 21436587a9
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 38b45812ef43a05bd957e545907e223b954ab4aaf088303ad910eadf14b42be6
+ * CTX 8b2461149d8c8ba85f992be970bc621f1b06573f63e867bf5875acafa04e42cc
+ * CTX bd7bd3c2a0fb1fff791ec5ec36c66ae4ac1e806d81fbf709dbe29e471fad3854
+ * CTX 9c8e66f5345d7c1eb94f405d1ec785cc6f6a68f6254dd8339f9d84057e01a177
+ * CTX 41990482999516b5611a38f41bb6478e6f173f320805dd71b1932fc333cb9ee3
+ * CTX 9936beea9ad96fa10fb4112b901734ddad40bc1878995f8e11aee7d141a2f5d4
+ * CTX 8b7a4e1e7f0b2c04830e69a4fd1378411c2f287edf48c6c4e5c247a19680f7fe
+ * CTX 41cefbd49b582106e3616cbbe4dfb2344b2ae9519391f3e0fb4922254b1d6d2d
+ * CTX 19c6d4d537b3a26f3bcc51588b32f3eca0829b6a5ac72578fb814fb43cf80d64
+ * CTX a233e3f997a3f02683342f2b33d25b492536b93becb2f5e1a8b82f5b88334272
+ * CTX 9e8ae09d16938841a21a97fb543eea3bbff59f13c1a18449e398701c1ad51648
+ * CTX 346cbc04c27bb2da3b93a1372ccae548fb53bee476f9e9c91773b1bb19828394
+ * CTX d55d3e1a20ed69113a860b6829ffa847224604435070221b257e8dff783615d2
+ * CTX cae4803a93aa4334ab482a0afac9c0aeda70b45a481df5dec5df8cc0f423c77a
+ * CTX 5fd46cd312021d4b438862419a791be03bb4d97c0e59578542531ba466a83baf
+ * CTX 92cefc151b5cc1611a167893819b63fb8a6b18e86de60290fa72b797b0ce59f3
+ * Plaintext length (bytes): 512
+ */
+static uint8_t v19_key1[16] = {
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef
+};
+
+static uint8_t v19_key2[16] = {
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf
+};
+
+static uint8_t v19_TW[16] = {
+ 0x21, 0x43, 0x65, 0x87, 0xa9, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v19_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v19_CTX[512] = {
+ 0x38, 0xb4, 0x58, 0x12, 0xef, 0x43, 0xa0, 0x5b,
+ 0xd9, 0x57, 0xe5, 0x45, 0x90, 0x7e, 0x22, 0x3b,
+ 0x95, 0x4a, 0xb4, 0xaa, 0xf0, 0x88, 0x30, 0x3a,
+ 0xd9, 0x10, 0xea, 0xdf, 0x14, 0xb4, 0x2b, 0xe6,
+ 0x8b, 0x24, 0x61, 0x14, 0x9d, 0x8c, 0x8b, 0xa8,
+ 0x5f, 0x99, 0x2b, 0xe9, 0x70, 0xbc, 0x62, 0x1f,
+ 0x1b, 0x06, 0x57, 0x3f, 0x63, 0xe8, 0x67, 0xbf,
+ 0x58, 0x75, 0xac, 0xaf, 0xa0, 0x4e, 0x42, 0xcc,
+ 0xbd, 0x7b, 0xd3, 0xc2, 0xa0, 0xfb, 0x1f, 0xff,
+ 0x79, 0x1e, 0xc5, 0xec, 0x36, 0xc6, 0x6a, 0xe4,
+ 0xac, 0x1e, 0x80, 0x6d, 0x81, 0xfb, 0xf7, 0x09,
+ 0xdb, 0xe2, 0x9e, 0x47, 0x1f, 0xad, 0x38, 0x54,
+ 0x9c, 0x8e, 0x66, 0xf5, 0x34, 0x5d, 0x7c, 0x1e,
+ 0xb9, 0x4f, 0x40, 0x5d, 0x1e, 0xc7, 0x85, 0xcc,
+ 0x6f, 0x6a, 0x68, 0xf6, 0x25, 0x4d, 0xd8, 0x33,
+ 0x9f, 0x9d, 0x84, 0x05, 0x7e, 0x01, 0xa1, 0x77,
+ 0x41, 0x99, 0x04, 0x82, 0x99, 0x95, 0x16, 0xb5,
+ 0x61, 0x1a, 0x38, 0xf4, 0x1b, 0xb6, 0x47, 0x8e,
+ 0x6f, 0x17, 0x3f, 0x32, 0x08, 0x05, 0xdd, 0x71,
+ 0xb1, 0x93, 0x2f, 0xc3, 0x33, 0xcb, 0x9e, 0xe3,
+ 0x99, 0x36, 0xbe, 0xea, 0x9a, 0xd9, 0x6f, 0xa1,
+ 0x0f, 0xb4, 0x11, 0x2b, 0x90, 0x17, 0x34, 0xdd,
+ 0xad, 0x40, 0xbc, 0x18, 0x78, 0x99, 0x5f, 0x8e,
+ 0x11, 0xae, 0xe7, 0xd1, 0x41, 0xa2, 0xf5, 0xd4,
+ 0x8b, 0x7a, 0x4e, 0x1e, 0x7f, 0x0b, 0x2c, 0x04,
+ 0x83, 0x0e, 0x69, 0xa4, 0xfd, 0x13, 0x78, 0x41,
+ 0x1c, 0x2f, 0x28, 0x7e, 0xdf, 0x48, 0xc6, 0xc4,
+ 0xe5, 0xc2, 0x47, 0xa1, 0x96, 0x80, 0xf7, 0xfe,
+ 0x41, 0xce, 0xfb, 0xd4, 0x9b, 0x58, 0x21, 0x06,
+ 0xe3, 0x61, 0x6c, 0xbb, 0xe4, 0xdf, 0xb2, 0x34,
+ 0x4b, 0x2a, 0xe9, 0x51, 0x93, 0x91, 0xf3, 0xe0,
+ 0xfb, 0x49, 0x22, 0x25, 0x4b, 0x1d, 0x6d, 0x2d,
+ 0x19, 0xc6, 0xd4, 0xd5, 0x37, 0xb3, 0xa2, 0x6f,
+ 0x3b, 0xcc, 0x51, 0x58, 0x8b, 0x32, 0xf3, 0xec,
+ 0xa0, 0x82, 0x9b, 0x6a, 0x5a, 0xc7, 0x25, 0x78,
+ 0xfb, 0x81, 0x4f, 0xb4, 0x3c, 0xf8, 0x0d, 0x64,
+ 0xa2, 0x33, 0xe3, 0xf9, 0x97, 0xa3, 0xf0, 0x26,
+ 0x83, 0x34, 0x2f, 0x2b, 0x33, 0xd2, 0x5b, 0x49,
+ 0x25, 0x36, 0xb9, 0x3b, 0xec, 0xb2, 0xf5, 0xe1,
+ 0xa8, 0xb8, 0x2f, 0x5b, 0x88, 0x33, 0x42, 0x72,
+ 0x9e, 0x8a, 0xe0, 0x9d, 0x16, 0x93, 0x88, 0x41,
+ 0xa2, 0x1a, 0x97, 0xfb, 0x54, 0x3e, 0xea, 0x3b,
+ 0xbf, 0xf5, 0x9f, 0x13, 0xc1, 0xa1, 0x84, 0x49,
+ 0xe3, 0x98, 0x70, 0x1c, 0x1a, 0xd5, 0x16, 0x48,
+ 0x34, 0x6c, 0xbc, 0x04, 0xc2, 0x7b, 0xb2, 0xda,
+ 0x3b, 0x93, 0xa1, 0x37, 0x2c, 0xca, 0xe5, 0x48,
+ 0xfb, 0x53, 0xbe, 0xe4, 0x76, 0xf9, 0xe9, 0xc9,
+ 0x17, 0x73, 0xb1, 0xbb, 0x19, 0x82, 0x83, 0x94,
+ 0xd5, 0x5d, 0x3e, 0x1a, 0x20, 0xed, 0x69, 0x11,
+ 0x3a, 0x86, 0x0b, 0x68, 0x29, 0xff, 0xa8, 0x47,
+ 0x22, 0x46, 0x04, 0x43, 0x50, 0x70, 0x22, 0x1b,
+ 0x25, 0x7e, 0x8d, 0xff, 0x78, 0x36, 0x15, 0xd2,
+ 0xca, 0xe4, 0x80, 0x3a, 0x93, 0xaa, 0x43, 0x34,
+ 0xab, 0x48, 0x2a, 0x0a, 0xfa, 0xc9, 0xc0, 0xae,
+ 0xda, 0x70, 0xb4, 0x5a, 0x48, 0x1d, 0xf5, 0xde,
+ 0xc5, 0xdf, 0x8c, 0xc0, 0xf4, 0x23, 0xc7, 0x7a,
+ 0x5f, 0xd4, 0x6c, 0xd3, 0x12, 0x02, 0x1d, 0x4b,
+ 0x43, 0x88, 0x62, 0x41, 0x9a, 0x79, 0x1b, 0xe0,
+ 0x3b, 0xb4, 0xd9, 0x7c, 0x0e, 0x59, 0x57, 0x85,
+ 0x42, 0x53, 0x1b, 0xa4, 0x66, 0xa8, 0x3b, 0xaf,
+ 0x92, 0xce, 0xfc, 0x15, 0x1b, 0x5c, 0xc1, 0x61,
+ 0x1a, 0x16, 0x78, 0x93, 0x81, 0x9b, 0x63, 0xfb,
+ 0x8a, 0x6b, 0x18, 0xe8, 0x6d, 0xe6, 0x02, 0x90,
+ 0xfa, 0x72, 0xb7, 0x97, 0xb0, 0xce, 0x59, 0xf3
+};
+
+// Define vector of structs, with pointers to the statically defined vectors
+
+struct xts_vector vlist[NVEC] = {
+
+ // pointers to the statically defined vectors here
+
+ // Vector 1
+ {sizeof(v1_CTX), v1_key1, v1_key2, v1_TW, v1_PTX, v1_CTX}
+ ,
+ // Vector 2
+ {sizeof(v2_CTX), v2_key1, v2_key2, v2_TW, v2_PTX, v2_CTX}
+ ,
+ // Vector 3
+ {sizeof(v3_CTX), v3_key1, v3_key2, v3_TW, v3_PTX, v3_CTX}
+ ,
+ // Vector 4
+ {sizeof(v4_CTX), v4_key1, v4_key2, v4_TW, v4_PTX, v4_CTX}
+ ,
+ // Vector 5
+ {sizeof(v5_CTX), v5_key1, v5_key2, v5_TW, v5_PTX, v5_CTX}
+ ,
+ // Vector 6
+ {sizeof(v6_CTX), v6_key1, v6_key2, v6_TW, v6_PTX, v6_CTX}
+ ,
+ // Vector 7
+ {sizeof(v7_CTX), v7_key1, v7_key2, v7_TW, v7_PTX, v7_CTX}
+ ,
+ // Vector 8
+ {sizeof(v8_CTX), v8_key1, v8_key2, v8_TW, v8_PTX, v8_CTX}
+ ,
+ // Vector 9
+ {sizeof(v9_CTX), v9_key1, v9_key2, v9_TW, v9_PTX, v9_CTX}
+ ,
+ // Vector 15
+ {sizeof(v15_CTX), v15_key1, v15_key2, v15_TW, v15_PTX, v15_CTX}
+ ,
+ // Vector 16
+ {sizeof(v16_CTX), v16_key1, v16_key2, v16_TW, v16_PTX, v16_CTX}
+ ,
+ // Vector 17
+ {sizeof(v17_CTX), v17_key1, v17_key2, v17_TW, v17_PTX, v17_CTX}
+ ,
+ // Vector 18
+ {sizeof(v18_CTX), v18_key1, v18_key2, v18_TW, v18_PTX, v18_CTX}
+ ,
+ // Vector 19
+ {sizeof(v19_CTX), v19_key1, v19_key2, v19_TW, v19_PTX, v19_CTX}
+
+};
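The table above is what the functional tests iterate over: each entry carries the plaintext length plus pointers to the key, tweak, plaintext and reference ciphertext arrays for that vector. As a rough sketch (not part of this patch; it assumes the standard-key prototype from aes_xts.h and the field names used by the expanded-key test further down), a minimal consumer of vlist[] could look like:

    /* Hypothetical check loop over vlist[]; a 512-byte buffer covers the largest vector. */
    #include <stdint.h>
    #include <string.h>
    #include "aes_xts.h"
    #include "xts_256_vect.h"

    static int check_all_vectors(void)
    {
            uint8_t out[512];
            int i;

            for (i = 0; i < NVEC; i++) {
                    XTS_AES_256_enc(vlist[i].key2, vlist[i].key1, vlist[i].TW,
                                    vlist[i].ptlen, vlist[i].PTX, out);
                    if (memcmp(out, vlist[i].CTX, vlist[i].ptlen))
                            return -1;      /* calculated ciphertext differs from the reference */
            }
            return 0;
    }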
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c
new file mode 100644
index 000000000..a3b3fc12e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_ossl_perf.c
@@ -0,0 +1,143 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+static inline
+ int openssl_aes_256_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptFinal_ex(ctx, ct + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+ unsigned char *pt, *ct, *dt, *refdt;
+ struct perf start, stop;
+ unsigned char keyssl[64]; /* SSL takes both keys together */
+
+ /* Initialise our cipher context, which can use same input vectors */
+ EVP_CIPHER_CTX ctx;
+ EVP_CIPHER_CTX_init(&ctx);
+
+ printf("aes_xts_256_dec_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+ refdt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt || NULL == refdt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ /* Set up key for the SSL engine */
+ for (i = 0; i < 32; i++) {
+ keyssl[i] = key1[i];
+ keyssl[i + 32] = key2[i];
+ }
+
+ /* Encrypt and compare decrypted output */
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ openssl_aes_256_xts_dec(&ctx, keyssl, tinit, TEST_LEN, ct, refdt);
+ if (memcmp(dt, refdt, TEST_LEN)) {
+ printf("ISA-L and OpenSSL results don't match\n");
+ return -1;
+ }
+
+ /* Time ISA-L decryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ perf_stop(&stop);
+ printf("aes_xts_256_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Time OpenSSL decryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ openssl_aes_256_xts_dec(&ctx, keyssl, tinit, TEST_LEN, ct, refdt);
+ perf_stop(&stop);
+ printf("aes_xts_256_openssl_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
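For the OpenSSL comparison above, EVP_aes_256_xts() expects the two 256-bit keys packed into a single 64-byte buffer, which is what the keyssl setup loop builds: key1 occupies the first 32 bytes and key2 the second 32. An equivalent, purely illustrative way to express the same packing:

    /* Same layout as the keyssl loop in main() above (sketch, not part of the patch). */
    memcpy(keyssl, key1, 32);
    memcpy(keyssl + 32, key2, 32);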
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c
new file mode 100644
index 000000000..30dce6950
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_dec_perf.c
@@ -0,0 +1,103 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+ unsigned char *pt, *ct, *dt;
+
+ printf("aes_xts_256_dec_perf:\n");
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+ struct perf start, stop;
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_256_dec" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
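With the default cold-cache configuration above, each XTS_AES_256_dec() call walks TEST_LEN = 2 * 32 MiB = 64 MiB of ciphertext, so the timed loop of TEST_LOOPS = 50 iterations hands perf_print() a byte count of 64 MiB * 50 = 3200 MiB; dividing that by the elapsed time gives the reported decryption throughput.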
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c
new file mode 100644
index 000000000..6431c27c9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_ossl_perf.c
@@ -0,0 +1,143 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+#include <openssl/evp.h>
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+static inline
+ int openssl_aes_256_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
+
+int main(void)
+{
+ int i;
+ unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+ unsigned char *pt, *ct, *refct;
+ struct perf start, stop;
+ unsigned char keyssl[64]; /* SSL takes both keys together */
+
+ /* Initialise our cipher context, which can use same input vectors */
+ EVP_CIPHER_CTX ctx;
+ EVP_CIPHER_CTX_init(&ctx);
+
+ printf("aes_xts_256_enc_perf:\n");
+
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ refct = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == refct) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (i = 0; i < 32; i++) {
+ keyssl[i] = key1[i];
+ keyssl[i + 32] = key2[i];
+ }
+
+ /* Encrypt and compare output */
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ openssl_aes_256_xts_enc(&ctx, keyssl, tinit, TEST_LEN, pt, refct);
+ if (memcmp(ct, refct, TEST_LEN)) {
+ printf("ISA-L and OpenSSL results don't match\n");
+ return -1;
+ }
+
+ /* Time ISA-L encryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ perf_stop(&stop);
+
+ printf("aes_xts_256_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ /* Time OpenSSL encryption */
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++)
+ openssl_aes_256_xts_enc(&ctx, keyssl, tinit, TEST_LEN, pt, refct);
+ perf_stop(&stop);
+
+ printf("aes_xts_256_ossl_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c
new file mode 100644
index 000000000..ac536e212
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_enc_perf.c
@@ -0,0 +1,101 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include "aes_xts.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Cached test, loop many times over small dataset
+# define TEST_LEN 8*1024
+# define TEST_LOOPS 400000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (2 * GT_L3_CACHE)
+# define TEST_LOOPS 50
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN
+
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int i;
+
+ unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+ unsigned char *pt, *ct;
+
+ printf("aes_xts_256_enc_perf:\n");
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+
+ struct perf start, stop;
+
+ perf_start(&start);
+
+ for (i = 0; i < TEST_LOOPS; i++) {
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ }
+
+ perf_stop(&stop);
+
+ printf("aes_xts_256_enc" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i);
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c
new file mode 100644
index 000000000..8278a226f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_expanded_key_test.c
@@ -0,0 +1,113 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <aes_keyexp.h>
+#include "xts_256_vect.h"
+
+int main(void)
+{
+
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test;
+ uint8_t *pt_test;
+ // Arrays for expanded keys, null_key is a dummy vector (decrypt key not
+ // needed for the tweak part of the decryption)
+ uint8_t expkey1_enc[16 * 15], expkey2_enc[16 * 15];
+ uint8_t expkey1_dec[16 * 15], null_key[16 * 15];
+
+ int i, j;
+
+ // --- Encryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vlist[i].ptlen);
+ if (ct_test == NULL) {
+ printf("Can't allocate ciphertext memory\n");
+ return -1;
+ }
+ // Pre-expand our keys (will only use the encryption ones here)
+ aes_keyexp_256(vlist[i].key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_256(vlist[i].key2, expkey2_enc, null_key);
+
+ XTS_AES_256_enc_expanded_key(expkey2_enc, expkey1_enc, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (ct_test[j] != vlist[i].CTX[j]) {
+ printf("\nXTS_AES_256_enc: Vector %d: ", i + 10);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ }
+
+ // --- Decryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated plaintext
+ pt_test = malloc(vlist[i].ptlen);
+ if (pt_test == NULL) {
+ printf("Can't allocate plaintext memory\n");
+ return -1;
+ }
+ // Pre-expand keys for the decryption
+ aes_keyexp_256(vlist[i].key1, expkey1_enc, expkey1_dec);
+ aes_keyexp_256(vlist[i].key2, expkey2_enc, null_key);
+
+ // Note, encryption key is re-used for the tweak decryption step
+ XTS_AES_256_dec_expanded_key(expkey2_enc, expkey1_dec, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (pt_test[j] != vlist[i].PTX[j]) {
+ printf("\nXTS_AES_256_dec: Vector %d: ", i + 10);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ }
+ printf("Pass\n");
+
+ return 0;
+}
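The expanded-key test above leans on two properties of the API: aes_keyexp_256() always emits both an encryption and a decryption schedule (so the unneeded key2 decryption schedule is parked in the null_key dummy), and the tweak is always encrypted, which is why the decryption call still passes the key2 encryption schedule. A small sketch of the equivalence the test effectively verifies (buffer names are illustrative, not from this patch):

    uint8_t k1_enc[16 * 15], k1_dec[16 * 15];
    uint8_t k2_enc[16 * 15], k2_dec[16 * 15];   /* k2_dec is never needed */

    aes_keyexp_256(key1, k1_enc, k1_dec);
    aes_keyexp_256(key2, k2_enc, k2_dec);

    /* Raw-key and expanded-key entry points should yield the same ciphertext. */
    XTS_AES_256_enc(key2, key1, tweak, len, pt, ct_raw);
    XTS_AES_256_enc_expanded_key(k2_enc, k1_enc, tweak, len, pt, ct_exp);
    /* memcmp(ct_raw, ct_exp, len) == 0 follows from both matching the reference CTX. */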
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c
new file mode 100644
index 000000000..bfa10b6b6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand.c
@@ -0,0 +1,249 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h> // for rand
+#include <string.h> // for memcmp
+#include <aes_xts.h>
+#include <aes_keyexp.h>
+
+#define TEST_LEN (1024*1024)
+#define TEST_SIZE (4096)
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
+
+int main(void)
+{
+ int t, n;
+
+ unsigned char key1[16 * 2], key2[16 * 2], tinit[16];
+ unsigned char *pt, *ct, *dt;
+
+ int align, size, min_size;
+ unsigned char *efence_pt;
+ unsigned char *efence_ct;
+ unsigned char *efence_dt;
+
+ unsigned char *origin_pt;
+ unsigned char *origin_ct;
+ unsigned char *origin_dt;
+
+ unsigned char key1_exp_enc[16 * 15], key1_exp_dec[16 * 15];
+ unsigned char key2_exp_tw[16 * 15];
+ int i;
+ printf("aes_xts_256 enc/dec rand test, %d sets of %d max: ", RANDOMS, TEST_LEN);
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+
+ if (memcmp(pt, dt, TEST_LEN)) {
+ printf("fail\n");
+ return -1;
+ }
+ putchar('.');
+
+ // Do tests with random data, keys and message size
+ for (t = 0; t < RANDOMS; t++) {
+ n = rand() % (TEST_LEN);
+ if (n < 17)
+ continue;
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, n);
+ XTS_AES_256_enc(key2, key1, tinit, n, pt, ct);
+ XTS_AES_256_dec(key2, key1, tinit, n, ct, dt);
+
+ if (memcmp(pt, dt, n)) {
+ printf("fail rand %d, size %d\n", t, n);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ // Run tests at end of buffer for Electric Fence
+ align = 1;
+ min_size = 16;
+ for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ XTS_AES_256_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_256_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+ printf("efence: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ origin_pt = malloc(TEST_LEN);
+ origin_ct = malloc(TEST_LEN);
+ origin_dt = malloc(TEST_LEN);
+ if (NULL == origin_pt || NULL == origin_ct || NULL == origin_dt) {
+ printf("malloc of testsize failed\n");
+ return -1;
+ }
+ // For data lengths from 0 to 15 bytes, the functions return without any error
+ // codes, without reading or writing any data.
+ for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+ memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+ memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+ XTS_AES_256_enc(key2, key1, tinit, TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_256_dec(key2, key1, tinit, TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+ printf("efence_pt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+ printf("efence_ct: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+ printf("efence_dt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ for (i = 0; i < 16 * 15; i++) {
+ key2_exp_tw[i] = rand();
+ }
+
+ for (size = 0; size <= TEST_SIZE - min_size; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ aes_keyexp_256(key1, key1_exp_enc, key1_exp_dec);
+
+ XTS_AES_256_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+ TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_256_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+ TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, efence_dt, TEST_SIZE - size)) {
+ printf("efence_expanded_key: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ putchar('.');
+ fflush(0);
+ }
+
+ // For data lengths from 0 to 15 bytes, the functions return without any error
+ // codes, without reading or writing any data.
+ for (size = TEST_SIZE - min_size + align; size <= TEST_SIZE; size += align) {
+
+ // Line up TEST_SIZE from end
+ efence_pt = pt + TEST_LEN - TEST_SIZE + size;
+ efence_ct = ct + TEST_LEN - TEST_SIZE + size;
+ efence_dt = dt + TEST_LEN - TEST_SIZE + size;
+
+ xts256_mk_rand_data(key1, key2, tinit, efence_pt, TEST_SIZE - size);
+ memcpy(efence_ct, efence_pt, TEST_SIZE - size);
+ memcpy(efence_dt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_pt, efence_pt, TEST_SIZE - size);
+ memcpy(origin_ct, efence_ct, TEST_SIZE - size);
+ memcpy(origin_dt, efence_dt, TEST_SIZE - size);
+
+ aes_keyexp_256(key1, key1_exp_enc, key1_exp_dec);
+
+ XTS_AES_256_enc_expanded_key(key2_exp_tw, key1_exp_enc, tinit,
+ TEST_SIZE - size, efence_pt, efence_ct);
+ XTS_AES_256_dec_expanded_key(key2_exp_tw, key1_exp_dec, tinit,
+ TEST_SIZE - size, efence_ct, efence_dt);
+
+ if (memcmp(efence_pt, origin_pt, TEST_SIZE - size)) {
+ printf("efence_expanded_key for pt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_ct, origin_ct, TEST_SIZE - size)) {
+ printf("efence_expanded_key for ct: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+ if (memcmp(efence_dt, origin_dt, TEST_SIZE - size)) {
+ printf("efence_expanded_key for dt: fail size %d\n", TEST_SIZE - size);
+ return -1;
+ }
+
+ putchar('.');
+ fflush(0);
+ }
+
+ printf("Pass\n");
+
+ return 0;
+}
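The two sub-16-byte loops above follow directly from the constants: with min_size = 16 and align = 1, size steps from TEST_SIZE - 15 up to TEST_SIZE, so the requested length TEST_SIZE - size runs from 15 down to 0 bytes. Those lengths are shorter than a single AES block, which is why the test only asserts that the plaintext, ciphertext and output buffers are left byte-for-byte untouched rather than comparing an encrypt/decrypt round trip.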
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c
new file mode 100644
index 000000000..99308b37e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_rand_ossl_test.c
@@ -0,0 +1,209 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+#include <stdlib.h>
+#include <openssl/evp.h>
+
+#define TEST_LEN (1024*1024)
+#define TEST_LOOPS 100
+#ifndef RANDOMS
+# define RANDOMS 100
+#endif
+
+/* Generates random data for keys, tweak and plaintext */
+void xts256_mk_rand_data(unsigned char *k1, unsigned char *k2, unsigned char *t,
+ unsigned char *p, int n)
+{
+ int i;
+ for (i = 0; i < 32; i++) {
+ *k1++ = rand();
+ *k2++ = rand();
+ }
+ for (i = 0; i < 16; i++)
+ *t++ = rand();
+
+ for (i = 0; i < n; i++)
+ *p++ = rand();
+
+}
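+
+/* Note: rand() is never seeded in this test, so the generated keys, tweaks and
+ * plaintext are identical on every run, which keeps failures reproducible. */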
+
+/* Wrapper for OpenSSL EVP AES-XTS 256 encryption */
+static inline
+ int openssl_aes_256_xts_enc(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *pt, unsigned char *ct)
+{
+ int outlen, tmplen;
+ if (!EVP_EncryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptUpdate(ctx, ct, &outlen, (const unsigned char *)pt, len))
+ printf("\n ERROR!! \n");
+ if (!EVP_EncryptFinal_ex(ctx, ct + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
+
+/* Wrapper for OpenSSL EVP AES-XTS 256 decryption */
+static inline
+ int openssl_aes_256_xts_dec(EVP_CIPHER_CTX * ctx, unsigned char *key, unsigned char *iv,
+ int len, unsigned char *ct, unsigned char *dt)
+{
+ int outlen, tmplen;
+ if (!EVP_DecryptInit_ex(ctx, EVP_aes_256_xts(), NULL, key, iv))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptUpdate(ctx, dt, &outlen, (const unsigned char *)ct, len))
+ printf("\n ERROR!! \n");
+ if (!EVP_DecryptFinal_ex(ctx, dt + outlen, &tmplen))
+ printf("\n ERROR!! \n");
+
+ return 0;
+}
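+
+/* Both wrappers assume OpenSSL's XTS key layout: one 64-byte buffer holding
+ * the data-encryption key (key1) in the first 32 bytes and the tweak key
+ * (key2) in the second 32 bytes. The ISA-L entry points instead take the
+ * tweak key as their first argument, which is why main() below passes
+ * (key2, key1, ...) to XTS_AES_256_enc/dec but key1||key2 to OpenSSL. */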
+
+int main(void)
+{
+
+ unsigned char key1[32], key2[32], tinit[16];
+ unsigned char *pt, *ct, *dt, *refct, *refdt;
+ unsigned char keyssl[64]; /* SSL takes both keys together */
+ int i, j, k;
+
+	/* Initialise our cipher context; the same context is reused for every set of inputs */
+ EVP_CIPHER_CTX ctx;
+ EVP_CIPHER_CTX_init(&ctx);
+
+ /* Allocate space for input and output buffers */
+ pt = malloc(TEST_LEN);
+ ct = malloc(TEST_LEN);
+ dt = malloc(TEST_LEN);
+ refct = malloc(TEST_LEN);
+ refdt = malloc(TEST_LEN);
+
+ if (NULL == pt || NULL == ct || NULL == dt || NULL == refct || NULL == refdt) {
+		printf("malloc of test buffers failed\n");
+ return -1;
+ }
+
+ /**************************** FIXED LENGTH TEST *************************/
+ printf("aes_xts_256_rand_ossl test, %d sets of length %d: ", TEST_LOOPS, TEST_LEN);
+
+ /* Loop over the vectors */
+ for (i = 0; i < TEST_LOOPS; i++) {
+
+ xts256_mk_rand_data(key1, key2, tinit, pt, TEST_LEN);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 32; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 32] = key2[k];
+ }
+
+ /* Encrypt using each method */
+ XTS_AES_256_enc(key2, key1, tinit, TEST_LEN, pt, ct);
+ openssl_aes_256_xts_enc(&ctx, keyssl, tinit, TEST_LEN, pt, refct);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < TEST_LEN; j++) {
+
+ if (ct[j] != refct[j]) {
+ printf("XTS_AES_256_enc failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+
+ /* Decrypt using each method */
+ XTS_AES_256_dec(key2, key1, tinit, TEST_LEN, ct, dt);
+ openssl_aes_256_xts_dec(&ctx, keyssl, tinit, TEST_LEN, refct, refdt);
+
+ for (j = 0; j < TEST_LEN; j++) {
+
+ if (dt[j] != refdt[j]) {
+ printf("XTS_AES_256_dec failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ fflush(0);
+ }
+ printf("Pass\n");
+
+ /**************************** RANDOM LENGTH TEST *************************/
+ printf("aes_xts_256_rand_ossl test, %d sets of random lengths: ", RANDOMS);
+
+ /* Run tests with random size */
+
+ unsigned int rand_len, t;
+
+ for (t = 0; t < RANDOMS; t++) {
+
+ rand_len = rand() % (TEST_LEN);
+ xts256_mk_rand_data(key1, key2, tinit, pt, rand_len);
+
+ /* Set up key for the SSL engine */
+ for (k = 0; k < 32; k++) {
+ keyssl[k] = key1[k];
+ keyssl[k + 32] = key2[k];
+ }
+
+ /* Encrypt using each method */
+ XTS_AES_256_enc(key2, key1, tinit, rand_len, pt, ct);
+ openssl_aes_256_xts_enc(&ctx, keyssl, tinit, rand_len, pt, refct);
+
+ /* Carry out comparison of the calculated ciphertext with
+ * the reference
+ */
+ for (j = 0; j < rand_len; j++) {
+
+ if (ct[j] != refct[j]) {
+ printf("XTS_AES_256_enc failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+
+ /* Decrypt using each method */
+ XTS_AES_256_dec(key2, key1, tinit, rand_len, ct, dt);
+ openssl_aes_256_xts_dec(&ctx, keyssl, tinit, rand_len, refct, refdt);
+
+ for (j = 0; j < rand_len; j++) {
+
+ if (dt[j] != refdt[j]) {
+ printf("XTS_AES_256_dec failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+ fflush(0);
+ }
+ printf("Pass\n");
+
+ printf("aes_xts_256_rand_ossl: All tests passed\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c
new file mode 100644
index 000000000..fc349524e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_test.c
@@ -0,0 +1,105 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "xts_256_vect.h"
+
+int main(void)
+{
+
+ // Temporary array for the calculated vectors
+ uint8_t *ct_test;
+ uint8_t *pt_test;
+
+ int i, j;
+
+ // --- Encryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+ // Allocate space for the calculated ciphertext
+ ct_test = malloc(vlist[i].ptlen);
+ if (ct_test == NULL) {
+ fprintf(stderr, "Can't allocate ciphertext memory\n");
+ return -1;
+ }
+
+ XTS_AES_256_enc(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].PTX, ct_test);
+
+ // Carry out comparison of the calculated ciphertext with
+ // the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (ct_test[j] != vlist[i].CTX[j]) {
+ printf("\nXTS_AES_256_enc: Vector %d: ", i + 10);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+
+		free(ct_test);
+		ct_test = NULL;
+ }
+
+ // --- Decryption test ---
+
+ // Loop over the vectors
+ for (i = 0; i < NVEC; i++) {
+
+		// Allocate space for the calculated plaintext
+ pt_test = malloc(vlist[i].ptlen);
+ if (pt_test == NULL) {
+ fprintf(stderr, "Can't allocate plaintext memory\n");
+ return -1;
+ }
+
+ XTS_AES_256_dec(vlist[i].key2, vlist[i].key1, vlist[i].TW,
+ vlist[i].ptlen, vlist[i].CTX, pt_test);
+
+		// Carry out comparison of the calculated plaintext with
+		// the reference
+ for (j = 0; j < vlist[i].ptlen; j++) {
+
+ if (pt_test[j] != vlist[i].PTX[j]) {
+ printf("\nXTS_AES_256_dec: Vector %d: ", i + 10);
+ printf("failed at byte %d! \n", j);
+ return -1;
+ }
+ }
+ printf(".");
+
+		free(pt_test);
+		pt_test = NULL;
+ }
+ printf("Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h
new file mode 100644
index 000000000..640d1ddf6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_256_vect.h
@@ -0,0 +1,1035 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "aes_xts.h"
+
+#define NVEC 5
+
+// struct to hold pointers to the key, plaintext and ciphertext vectors
+struct xts_vector {
+ uint64_t ptlen; // length of our plaintext
+	uint8_t *key1;		// dimension 32 for 256-bit AES
+	uint8_t *key2;		// dimension 32 for 256-bit AES
+ uint8_t *TW; // dimension 16 for both 128 and 256 bit
+ uint8_t *PTX; // min. dimension 16
+ uint8_t *CTX; // same dimension as PTX
+};
+
+/* Define our test vectors statically here. Test vectors are from the standard:
+ * "IEEE Standard for Cryptographic Protection of Data on Block-Oriented
+ * Storage Devices"
+ * http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=4493450
+ *
+ * Vector 10
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 1c3b3a102f770386e4836c99e370cf9bea00803f5e482357a4ae12d414a3e63b
+ * CTX 5d31e276f8fe4a8d66b317f9ac683f44680a86ac35adfc3345befecb4bb188fd
+ * CTX 5776926c49a3095eb108fd1098baec70aaa66999a72a82f27d848b21d4a741b0
+ * CTX c5cd4d5fff9dac89aeba122961d03a757123e9870f8acf1000020887891429ca
+ * CTX 2a3e7a7d7df7b10355165c8b9a6d0a7de8b062c4500dc4cd120c0f7418dae3d0
+ * CTX b5781c34803fa75421c790dfe1de1834f280d7667b327f6c8cd7557e12ac3a0f
+ * CTX 93ec05c52e0493ef31a12d3d9260f79a289d6a379bc70c50841473d1a8cc81ec
+ * CTX 583e9645e07b8d9670655ba5bbcfecc6dc3966380ad8fecb17b6ba02469a020a
+ * CTX 84e18e8f84252070c13e9f1f289be54fbc481457778f616015e1327a02b140f1
+ * CTX 505eb309326d68378f8374595c849d84f4c333ec4423885143cb47bd71c5edae
+ * CTX 9be69a2ffeceb1bec9de244fbe15992b11b77c040f12bd8f6a975a44a0f90c29
+ * CTX a9abc3d4d893927284c58754cce294529f8614dcd2aba991925fedc4ae74ffac
+ * CTX 6e333b93eb4aff0479da9a410e4450e0dd7ae4c6e2910900575da401fc07059f
+ * CTX 645e8b7e9bfdef33943054ff84011493c27b3429eaedb4ed5376441a77ed4385
+ * CTX 1ad77f16f541dfd269d50d6a5f14fb0aab1cbb4c1550be97f7ab4066193c4caa
+ * CTX 773dad38014bd2092fa755c824bb5e54c4f36ffda9fcea70b9c6e693e148c151
+ * Plaintext length (bytes): 512
+ */
+
+static uint8_t v10_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v10_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v10_TW[16] = {
+ 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v10_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v10_CTX[512] = {
+ 0x1c, 0x3b, 0x3a, 0x10, 0x2f, 0x77, 0x03, 0x86,
+ 0xe4, 0x83, 0x6c, 0x99, 0xe3, 0x70, 0xcf, 0x9b,
+ 0xea, 0x00, 0x80, 0x3f, 0x5e, 0x48, 0x23, 0x57,
+ 0xa4, 0xae, 0x12, 0xd4, 0x14, 0xa3, 0xe6, 0x3b,
+ 0x5d, 0x31, 0xe2, 0x76, 0xf8, 0xfe, 0x4a, 0x8d,
+ 0x66, 0xb3, 0x17, 0xf9, 0xac, 0x68, 0x3f, 0x44,
+ 0x68, 0x0a, 0x86, 0xac, 0x35, 0xad, 0xfc, 0x33,
+ 0x45, 0xbe, 0xfe, 0xcb, 0x4b, 0xb1, 0x88, 0xfd,
+ 0x57, 0x76, 0x92, 0x6c, 0x49, 0xa3, 0x09, 0x5e,
+ 0xb1, 0x08, 0xfd, 0x10, 0x98, 0xba, 0xec, 0x70,
+ 0xaa, 0xa6, 0x69, 0x99, 0xa7, 0x2a, 0x82, 0xf2,
+ 0x7d, 0x84, 0x8b, 0x21, 0xd4, 0xa7, 0x41, 0xb0,
+ 0xc5, 0xcd, 0x4d, 0x5f, 0xff, 0x9d, 0xac, 0x89,
+ 0xae, 0xba, 0x12, 0x29, 0x61, 0xd0, 0x3a, 0x75,
+ 0x71, 0x23, 0xe9, 0x87, 0x0f, 0x8a, 0xcf, 0x10,
+ 0x00, 0x02, 0x08, 0x87, 0x89, 0x14, 0x29, 0xca,
+ 0x2a, 0x3e, 0x7a, 0x7d, 0x7d, 0xf7, 0xb1, 0x03,
+ 0x55, 0x16, 0x5c, 0x8b, 0x9a, 0x6d, 0x0a, 0x7d,
+ 0xe8, 0xb0, 0x62, 0xc4, 0x50, 0x0d, 0xc4, 0xcd,
+ 0x12, 0x0c, 0x0f, 0x74, 0x18, 0xda, 0xe3, 0xd0,
+ 0xb5, 0x78, 0x1c, 0x34, 0x80, 0x3f, 0xa7, 0x54,
+ 0x21, 0xc7, 0x90, 0xdf, 0xe1, 0xde, 0x18, 0x34,
+ 0xf2, 0x80, 0xd7, 0x66, 0x7b, 0x32, 0x7f, 0x6c,
+ 0x8c, 0xd7, 0x55, 0x7e, 0x12, 0xac, 0x3a, 0x0f,
+ 0x93, 0xec, 0x05, 0xc5, 0x2e, 0x04, 0x93, 0xef,
+ 0x31, 0xa1, 0x2d, 0x3d, 0x92, 0x60, 0xf7, 0x9a,
+ 0x28, 0x9d, 0x6a, 0x37, 0x9b, 0xc7, 0x0c, 0x50,
+ 0x84, 0x14, 0x73, 0xd1, 0xa8, 0xcc, 0x81, 0xec,
+ 0x58, 0x3e, 0x96, 0x45, 0xe0, 0x7b, 0x8d, 0x96,
+ 0x70, 0x65, 0x5b, 0xa5, 0xbb, 0xcf, 0xec, 0xc6,
+ 0xdc, 0x39, 0x66, 0x38, 0x0a, 0xd8, 0xfe, 0xcb,
+ 0x17, 0xb6, 0xba, 0x02, 0x46, 0x9a, 0x02, 0x0a,
+ 0x84, 0xe1, 0x8e, 0x8f, 0x84, 0x25, 0x20, 0x70,
+ 0xc1, 0x3e, 0x9f, 0x1f, 0x28, 0x9b, 0xe5, 0x4f,
+ 0xbc, 0x48, 0x14, 0x57, 0x77, 0x8f, 0x61, 0x60,
+ 0x15, 0xe1, 0x32, 0x7a, 0x02, 0xb1, 0x40, 0xf1,
+ 0x50, 0x5e, 0xb3, 0x09, 0x32, 0x6d, 0x68, 0x37,
+ 0x8f, 0x83, 0x74, 0x59, 0x5c, 0x84, 0x9d, 0x84,
+ 0xf4, 0xc3, 0x33, 0xec, 0x44, 0x23, 0x88, 0x51,
+ 0x43, 0xcb, 0x47, 0xbd, 0x71, 0xc5, 0xed, 0xae,
+ 0x9b, 0xe6, 0x9a, 0x2f, 0xfe, 0xce, 0xb1, 0xbe,
+ 0xc9, 0xde, 0x24, 0x4f, 0xbe, 0x15, 0x99, 0x2b,
+ 0x11, 0xb7, 0x7c, 0x04, 0x0f, 0x12, 0xbd, 0x8f,
+ 0x6a, 0x97, 0x5a, 0x44, 0xa0, 0xf9, 0x0c, 0x29,
+ 0xa9, 0xab, 0xc3, 0xd4, 0xd8, 0x93, 0x92, 0x72,
+ 0x84, 0xc5, 0x87, 0x54, 0xcc, 0xe2, 0x94, 0x52,
+ 0x9f, 0x86, 0x14, 0xdc, 0xd2, 0xab, 0xa9, 0x91,
+ 0x92, 0x5f, 0xed, 0xc4, 0xae, 0x74, 0xff, 0xac,
+ 0x6e, 0x33, 0x3b, 0x93, 0xeb, 0x4a, 0xff, 0x04,
+ 0x79, 0xda, 0x9a, 0x41, 0x0e, 0x44, 0x50, 0xe0,
+ 0xdd, 0x7a, 0xe4, 0xc6, 0xe2, 0x91, 0x09, 0x00,
+ 0x57, 0x5d, 0xa4, 0x01, 0xfc, 0x07, 0x05, 0x9f,
+ 0x64, 0x5e, 0x8b, 0x7e, 0x9b, 0xfd, 0xef, 0x33,
+ 0x94, 0x30, 0x54, 0xff, 0x84, 0x01, 0x14, 0x93,
+ 0xc2, 0x7b, 0x34, 0x29, 0xea, 0xed, 0xb4, 0xed,
+ 0x53, 0x76, 0x44, 0x1a, 0x77, 0xed, 0x43, 0x85,
+ 0x1a, 0xd7, 0x7f, 0x16, 0xf5, 0x41, 0xdf, 0xd2,
+ 0x69, 0xd5, 0x0d, 0x6a, 0x5f, 0x14, 0xfb, 0x0a,
+ 0xab, 0x1c, 0xbb, 0x4c, 0x15, 0x50, 0xbe, 0x97,
+ 0xf7, 0xab, 0x40, 0x66, 0x19, 0x3c, 0x4c, 0xaa,
+ 0x77, 0x3d, 0xad, 0x38, 0x01, 0x4b, 0xd2, 0x09,
+ 0x2f, 0xa7, 0x55, 0xc8, 0x24, 0xbb, 0x5e, 0x54,
+ 0xc4, 0xf3, 0x6f, 0xfd, 0xa9, 0xfc, 0xea, 0x70,
+ 0xb9, 0xc6, 0xe6, 0x93, 0xe1, 0x48, 0xc1, 0x51
+};
+
+/*
+ * Vector 11
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 77a31251618a15e6b92d1d66dffe7b50b50bad552305ba0217a610688eff7e11
+ * CTX e1d0225438e093242d6db274fde801d4cae06f2092c728b2478559df58e837c2
+ * CTX 469ee4a4fa794e4bbc7f39bc026e3cb72c33b0888f25b4acf56a2a9804f1ce6d
+ * CTX 3d6e1dc6ca181d4b546179d55544aa7760c40d06741539c7e3cd9d2f6650b201
+ * CTX 3fd0eeb8c2b8e3d8d240ccae2d4c98320a7442e1c8d75a42d6e6cfa4c2eca179
+ * CTX 8d158c7aecdf82490f24bb9b38e108bcda12c3faf9a21141c3613b58367f922a
+ * CTX aa26cd22f23d708dae699ad7cb40a8ad0b6e2784973dcb605684c08b8d6998c6
+ * CTX 9aac049921871ebb65301a4619ca80ecb485a31d744223ce8ddc2394828d6a80
+ * CTX 470c092f5ba413c3378fa6054255c6f9df4495862bbb3287681f931b687c888a
+ * CTX bf844dfc8fc28331e579928cd12bd2390ae123cf03818d14dedde5c0c24c8ab0
+ * CTX 18bfca75ca096f2d531f3d1619e785f1ada437cab92e980558b3dce1474afb75
+ * CTX bfedbf8ff54cb2618e0244c9ac0d3c66fb51598cd2db11f9be39791abe447c63
+ * CTX 094f7c453b7ff87cb5bb36b7c79efb0872d17058b83b15ab0866ad8a58656c5a
+ * CTX 7e20dbdf308b2461d97c0ec0024a2715055249cf3b478ddd4740de654f75ca68
+ * CTX 6e0d7345c69ed50cdc2a8b332b1f8824108ac937eb050585608ee734097fc090
+ * CTX 54fbff89eeaeea791f4a7ab1f9868294a4f9e27b42af8100cb9d59cef9645803
+ * Plaintext length (bytes): 512
+*/
+static uint8_t v11_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v11_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v11_TW[16] = {
+ 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v11_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v11_CTX[512] = {
+ 0x77, 0xa3, 0x12, 0x51, 0x61, 0x8a, 0x15, 0xe6,
+ 0xb9, 0x2d, 0x1d, 0x66, 0xdf, 0xfe, 0x7b, 0x50,
+ 0xb5, 0x0b, 0xad, 0x55, 0x23, 0x05, 0xba, 0x02,
+ 0x17, 0xa6, 0x10, 0x68, 0x8e, 0xff, 0x7e, 0x11,
+ 0xe1, 0xd0, 0x22, 0x54, 0x38, 0xe0, 0x93, 0x24,
+ 0x2d, 0x6d, 0xb2, 0x74, 0xfd, 0xe8, 0x01, 0xd4,
+ 0xca, 0xe0, 0x6f, 0x20, 0x92, 0xc7, 0x28, 0xb2,
+ 0x47, 0x85, 0x59, 0xdf, 0x58, 0xe8, 0x37, 0xc2,
+ 0x46, 0x9e, 0xe4, 0xa4, 0xfa, 0x79, 0x4e, 0x4b,
+ 0xbc, 0x7f, 0x39, 0xbc, 0x02, 0x6e, 0x3c, 0xb7,
+ 0x2c, 0x33, 0xb0, 0x88, 0x8f, 0x25, 0xb4, 0xac,
+ 0xf5, 0x6a, 0x2a, 0x98, 0x04, 0xf1, 0xce, 0x6d,
+ 0x3d, 0x6e, 0x1d, 0xc6, 0xca, 0x18, 0x1d, 0x4b,
+ 0x54, 0x61, 0x79, 0xd5, 0x55, 0x44, 0xaa, 0x77,
+ 0x60, 0xc4, 0x0d, 0x06, 0x74, 0x15, 0x39, 0xc7,
+ 0xe3, 0xcd, 0x9d, 0x2f, 0x66, 0x50, 0xb2, 0x01,
+ 0x3f, 0xd0, 0xee, 0xb8, 0xc2, 0xb8, 0xe3, 0xd8,
+ 0xd2, 0x40, 0xcc, 0xae, 0x2d, 0x4c, 0x98, 0x32,
+ 0x0a, 0x74, 0x42, 0xe1, 0xc8, 0xd7, 0x5a, 0x42,
+ 0xd6, 0xe6, 0xcf, 0xa4, 0xc2, 0xec, 0xa1, 0x79,
+ 0x8d, 0x15, 0x8c, 0x7a, 0xec, 0xdf, 0x82, 0x49,
+ 0x0f, 0x24, 0xbb, 0x9b, 0x38, 0xe1, 0x08, 0xbc,
+ 0xda, 0x12, 0xc3, 0xfa, 0xf9, 0xa2, 0x11, 0x41,
+ 0xc3, 0x61, 0x3b, 0x58, 0x36, 0x7f, 0x92, 0x2a,
+ 0xaa, 0x26, 0xcd, 0x22, 0xf2, 0x3d, 0x70, 0x8d,
+ 0xae, 0x69, 0x9a, 0xd7, 0xcb, 0x40, 0xa8, 0xad,
+ 0x0b, 0x6e, 0x27, 0x84, 0x97, 0x3d, 0xcb, 0x60,
+ 0x56, 0x84, 0xc0, 0x8b, 0x8d, 0x69, 0x98, 0xc6,
+ 0x9a, 0xac, 0x04, 0x99, 0x21, 0x87, 0x1e, 0xbb,
+ 0x65, 0x30, 0x1a, 0x46, 0x19, 0xca, 0x80, 0xec,
+ 0xb4, 0x85, 0xa3, 0x1d, 0x74, 0x42, 0x23, 0xce,
+ 0x8d, 0xdc, 0x23, 0x94, 0x82, 0x8d, 0x6a, 0x80,
+ 0x47, 0x0c, 0x09, 0x2f, 0x5b, 0xa4, 0x13, 0xc3,
+ 0x37, 0x8f, 0xa6, 0x05, 0x42, 0x55, 0xc6, 0xf9,
+ 0xdf, 0x44, 0x95, 0x86, 0x2b, 0xbb, 0x32, 0x87,
+ 0x68, 0x1f, 0x93, 0x1b, 0x68, 0x7c, 0x88, 0x8a,
+ 0xbf, 0x84, 0x4d, 0xfc, 0x8f, 0xc2, 0x83, 0x31,
+ 0xe5, 0x79, 0x92, 0x8c, 0xd1, 0x2b, 0xd2, 0x39,
+ 0x0a, 0xe1, 0x23, 0xcf, 0x03, 0x81, 0x8d, 0x14,
+ 0xde, 0xdd, 0xe5, 0xc0, 0xc2, 0x4c, 0x8a, 0xb0,
+ 0x18, 0xbf, 0xca, 0x75, 0xca, 0x09, 0x6f, 0x2d,
+ 0x53, 0x1f, 0x3d, 0x16, 0x19, 0xe7, 0x85, 0xf1,
+ 0xad, 0xa4, 0x37, 0xca, 0xb9, 0x2e, 0x98, 0x05,
+ 0x58, 0xb3, 0xdc, 0xe1, 0x47, 0x4a, 0xfb, 0x75,
+ 0xbf, 0xed, 0xbf, 0x8f, 0xf5, 0x4c, 0xb2, 0x61,
+ 0x8e, 0x02, 0x44, 0xc9, 0xac, 0x0d, 0x3c, 0x66,
+ 0xfb, 0x51, 0x59, 0x8c, 0xd2, 0xdb, 0x11, 0xf9,
+ 0xbe, 0x39, 0x79, 0x1a, 0xbe, 0x44, 0x7c, 0x63,
+ 0x09, 0x4f, 0x7c, 0x45, 0x3b, 0x7f, 0xf8, 0x7c,
+ 0xb5, 0xbb, 0x36, 0xb7, 0xc7, 0x9e, 0xfb, 0x08,
+ 0x72, 0xd1, 0x70, 0x58, 0xb8, 0x3b, 0x15, 0xab,
+ 0x08, 0x66, 0xad, 0x8a, 0x58, 0x65, 0x6c, 0x5a,
+ 0x7e, 0x20, 0xdb, 0xdf, 0x30, 0x8b, 0x24, 0x61,
+ 0xd9, 0x7c, 0x0e, 0xc0, 0x02, 0x4a, 0x27, 0x15,
+ 0x05, 0x52, 0x49, 0xcf, 0x3b, 0x47, 0x8d, 0xdd,
+ 0x47, 0x40, 0xde, 0x65, 0x4f, 0x75, 0xca, 0x68,
+ 0x6e, 0x0d, 0x73, 0x45, 0xc6, 0x9e, 0xd5, 0x0c,
+ 0xdc, 0x2a, 0x8b, 0x33, 0x2b, 0x1f, 0x88, 0x24,
+ 0x10, 0x8a, 0xc9, 0x37, 0xeb, 0x05, 0x05, 0x85,
+ 0x60, 0x8e, 0xe7, 0x34, 0x09, 0x7f, 0xc0, 0x90,
+ 0x54, 0xfb, 0xff, 0x89, 0xee, 0xae, 0xea, 0x79,
+ 0x1f, 0x4a, 0x7a, 0xb1, 0xf9, 0x86, 0x82, 0x94,
+ 0xa4, 0xf9, 0xe2, 0x7b, 0x42, 0xaf, 0x81, 0x00,
+ 0xcb, 0x9d, 0x59, 0xce, 0xf9, 0x64, 0x58, 0x03
+};
+
+/*
+ * Vector 12
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX e387aaa58ba483afa7e8eb469778317ecf4cf573aa9d4eac23f2cdf914e4e200
+ * CTX a8b490e42ee646802dc6ee2b471b278195d60918ececb44bf79966f83faba049
+ * CTX 9298ebc699c0c8634715a320bb4f075d622e74c8c932004f25b41e361025b5a8
+ * CTX 7815391f6108fc4afa6a05d9303c6ba68a128a55705d415985832fdeaae6c8e1
+ * CTX 9110e84d1b1f199a2692119edc96132658f09da7c623efcec712537a3d94c0bf
+ * CTX 5d7e352ec94ae5797fdb377dc1551150721adf15bd26a8efc2fcaad56881fa9e
+ * CTX 62462c28f30ae1ceaca93c345cf243b73f542e2074a705bd2643bb9f7cc79bb6
+ * CTX e7091ea6e232df0f9ad0d6cf502327876d82207abf2115cdacf6d5a48f6c1879
+ * CTX a65b115f0f8b3cb3c59d15dd8c769bc014795a1837f3901b5845eb491adfefe0
+ * CTX 97b1fa30a12fc1f65ba22905031539971a10f2f36c321bb51331cdefb39e3964
+ * CTX c7ef079994f5b69b2edd83a71ef549971ee93f44eac3938fcdd61d01fa71799d
+ * CTX a3a8091c4c48aa9ed263ff0749df95d44fef6a0bb578ec69456aa5408ae32c7a
+ * CTX f08ad7ba8921287e3bbee31b767be06a0e705c864a769137df28292283ea81a2
+ * CTX 480241b44d9921cdbec1bc28dc1fda114bd8e5217ac9d8ebafa720e9da4f9ace
+ * CTX 231cc949e5b96fe76ffc21063fddc83a6b8679c00d35e09576a875305bed5f36
+ * CTX ed242c8900dd1fa965bc950dfce09b132263a1eef52dd6888c309f5a7d712826
+ * Plaintext length (bytes): 512
+*/
+
+static uint8_t v12_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v12_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v12_TW[16] = {
+ 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v12_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v12_CTX[512] = {
+ 0xe3, 0x87, 0xaa, 0xa5, 0x8b, 0xa4, 0x83, 0xaf,
+ 0xa7, 0xe8, 0xeb, 0x46, 0x97, 0x78, 0x31, 0x7e,
+ 0xcf, 0x4c, 0xf5, 0x73, 0xaa, 0x9d, 0x4e, 0xac,
+ 0x23, 0xf2, 0xcd, 0xf9, 0x14, 0xe4, 0xe2, 0x00,
+ 0xa8, 0xb4, 0x90, 0xe4, 0x2e, 0xe6, 0x46, 0x80,
+ 0x2d, 0xc6, 0xee, 0x2b, 0x47, 0x1b, 0x27, 0x81,
+ 0x95, 0xd6, 0x09, 0x18, 0xec, 0xec, 0xb4, 0x4b,
+ 0xf7, 0x99, 0x66, 0xf8, 0x3f, 0xab, 0xa0, 0x49,
+ 0x92, 0x98, 0xeb, 0xc6, 0x99, 0xc0, 0xc8, 0x63,
+ 0x47, 0x15, 0xa3, 0x20, 0xbb, 0x4f, 0x07, 0x5d,
+ 0x62, 0x2e, 0x74, 0xc8, 0xc9, 0x32, 0x00, 0x4f,
+ 0x25, 0xb4, 0x1e, 0x36, 0x10, 0x25, 0xb5, 0xa8,
+ 0x78, 0x15, 0x39, 0x1f, 0x61, 0x08, 0xfc, 0x4a,
+ 0xfa, 0x6a, 0x05, 0xd9, 0x30, 0x3c, 0x6b, 0xa6,
+ 0x8a, 0x12, 0x8a, 0x55, 0x70, 0x5d, 0x41, 0x59,
+ 0x85, 0x83, 0x2f, 0xde, 0xaa, 0xe6, 0xc8, 0xe1,
+ 0x91, 0x10, 0xe8, 0x4d, 0x1b, 0x1f, 0x19, 0x9a,
+ 0x26, 0x92, 0x11, 0x9e, 0xdc, 0x96, 0x13, 0x26,
+ 0x58, 0xf0, 0x9d, 0xa7, 0xc6, 0x23, 0xef, 0xce,
+ 0xc7, 0x12, 0x53, 0x7a, 0x3d, 0x94, 0xc0, 0xbf,
+ 0x5d, 0x7e, 0x35, 0x2e, 0xc9, 0x4a, 0xe5, 0x79,
+ 0x7f, 0xdb, 0x37, 0x7d, 0xc1, 0x55, 0x11, 0x50,
+ 0x72, 0x1a, 0xdf, 0x15, 0xbd, 0x26, 0xa8, 0xef,
+ 0xc2, 0xfc, 0xaa, 0xd5, 0x68, 0x81, 0xfa, 0x9e,
+ 0x62, 0x46, 0x2c, 0x28, 0xf3, 0x0a, 0xe1, 0xce,
+ 0xac, 0xa9, 0x3c, 0x34, 0x5c, 0xf2, 0x43, 0xb7,
+ 0x3f, 0x54, 0x2e, 0x20, 0x74, 0xa7, 0x05, 0xbd,
+ 0x26, 0x43, 0xbb, 0x9f, 0x7c, 0xc7, 0x9b, 0xb6,
+ 0xe7, 0x09, 0x1e, 0xa6, 0xe2, 0x32, 0xdf, 0x0f,
+ 0x9a, 0xd0, 0xd6, 0xcf, 0x50, 0x23, 0x27, 0x87,
+ 0x6d, 0x82, 0x20, 0x7a, 0xbf, 0x21, 0x15, 0xcd,
+ 0xac, 0xf6, 0xd5, 0xa4, 0x8f, 0x6c, 0x18, 0x79,
+ 0xa6, 0x5b, 0x11, 0x5f, 0x0f, 0x8b, 0x3c, 0xb3,
+ 0xc5, 0x9d, 0x15, 0xdd, 0x8c, 0x76, 0x9b, 0xc0,
+ 0x14, 0x79, 0x5a, 0x18, 0x37, 0xf3, 0x90, 0x1b,
+ 0x58, 0x45, 0xeb, 0x49, 0x1a, 0xdf, 0xef, 0xe0,
+ 0x97, 0xb1, 0xfa, 0x30, 0xa1, 0x2f, 0xc1, 0xf6,
+ 0x5b, 0xa2, 0x29, 0x05, 0x03, 0x15, 0x39, 0x97,
+ 0x1a, 0x10, 0xf2, 0xf3, 0x6c, 0x32, 0x1b, 0xb5,
+ 0x13, 0x31, 0xcd, 0xef, 0xb3, 0x9e, 0x39, 0x64,
+ 0xc7, 0xef, 0x07, 0x99, 0x94, 0xf5, 0xb6, 0x9b,
+ 0x2e, 0xdd, 0x83, 0xa7, 0x1e, 0xf5, 0x49, 0x97,
+ 0x1e, 0xe9, 0x3f, 0x44, 0xea, 0xc3, 0x93, 0x8f,
+ 0xcd, 0xd6, 0x1d, 0x01, 0xfa, 0x71, 0x79, 0x9d,
+ 0xa3, 0xa8, 0x09, 0x1c, 0x4c, 0x48, 0xaa, 0x9e,
+ 0xd2, 0x63, 0xff, 0x07, 0x49, 0xdf, 0x95, 0xd4,
+ 0x4f, 0xef, 0x6a, 0x0b, 0xb5, 0x78, 0xec, 0x69,
+ 0x45, 0x6a, 0xa5, 0x40, 0x8a, 0xe3, 0x2c, 0x7a,
+ 0xf0, 0x8a, 0xd7, 0xba, 0x89, 0x21, 0x28, 0x7e,
+ 0x3b, 0xbe, 0xe3, 0x1b, 0x76, 0x7b, 0xe0, 0x6a,
+ 0x0e, 0x70, 0x5c, 0x86, 0x4a, 0x76, 0x91, 0x37,
+ 0xdf, 0x28, 0x29, 0x22, 0x83, 0xea, 0x81, 0xa2,
+ 0x48, 0x02, 0x41, 0xb4, 0x4d, 0x99, 0x21, 0xcd,
+ 0xbe, 0xc1, 0xbc, 0x28, 0xdc, 0x1f, 0xda, 0x11,
+ 0x4b, 0xd8, 0xe5, 0x21, 0x7a, 0xc9, 0xd8, 0xeb,
+ 0xaf, 0xa7, 0x20, 0xe9, 0xda, 0x4f, 0x9a, 0xce,
+ 0x23, 0x1c, 0xc9, 0x49, 0xe5, 0xb9, 0x6f, 0xe7,
+ 0x6f, 0xfc, 0x21, 0x06, 0x3f, 0xdd, 0xc8, 0x3a,
+ 0x6b, 0x86, 0x79, 0xc0, 0x0d, 0x35, 0xe0, 0x95,
+ 0x76, 0xa8, 0x75, 0x30, 0x5b, 0xed, 0x5f, 0x36,
+ 0xed, 0x24, 0x2c, 0x89, 0x00, 0xdd, 0x1f, 0xa9,
+ 0x65, 0xbc, 0x95, 0x0d, 0xfc, 0xe0, 0x9b, 0x13,
+ 0x22, 0x63, 0xa1, 0xee, 0xf5, 0x2d, 0xd6, 0x88,
+ 0x8c, 0x30, 0x9f, 0x5a, 0x7d, 0x71, 0x28, 0x26
+};
+
+/*
+ * Vector 13
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffffffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX bf53d2dade78e822a4d949a9bc6766b01b06a8ef70d26748c6a7fc36d80ae4c5
+ * CTX 520f7c4ab0ac8544424fa405162fef5a6b7f229498063618d39f0003cb5fb8d1
+ * CTX c86b643497da1ff945c8d3bedeca4f479702a7a735f043ddb1d6aaade3c4a0ac
+ * CTX 7ca7f3fa5279bef56f82cd7a2f38672e824814e10700300a055e1630b8f1cb0e
+ * CTX 919f5e942010a416e2bf48cb46993d3cb6a51c19bacf864785a00bc2ecff15d3
+ * CTX 50875b246ed53e68be6f55bd7e05cfc2b2ed6432198a6444b6d8c247fab941f5
+ * CTX 69768b5c429366f1d3f00f0345b96123d56204c01c63b22ce78baf116e525ed9
+ * CTX 0fdea39fa469494d3866c31e05f295ff21fea8d4e6e13d67e47ce722e9698a1c
+ * CTX 1048d68ebcde76b86fcf976eab8aa9790268b7068e017a8b9b749409514f1053
+ * CTX 027fd16c3786ea1bac5f15cb79711ee2abe82f5cf8b13ae73030ef5b9e4457e7
+ * CTX 5d1304f988d62dd6fc4b94ed38ba831da4b7634971b6cd8ec325d9c61c00f1df
+ * CTX 73627ed3745a5e8489f3a95c69639c32cd6e1d537a85f75cc844726e8a72fc00
+ * CTX 77ad22000f1d5078f6b866318c668f1ad03d5a5fced5219f2eabbd0aa5c0f460
+ * CTX d183f04404a0d6f469558e81fab24a167905ab4c7878502ad3e38fdbe62a4155
+ * CTX 6cec37325759533ce8f25f367c87bb5578d667ae93f9e2fd99bcbc5f2fbba88c
+ * CTX f6516139420fcff3b7361d86322c4bd84c82f335abb152c4a93411373aaa8220
+ * Plaintext length (bytes): 512
+*/
+
+static uint8_t v13_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v13_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v13_TW[16] = {
+ 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v13_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v13_CTX[512] = {
+ 0xbf, 0x53, 0xd2, 0xda, 0xde, 0x78, 0xe8, 0x22,
+ 0xa4, 0xd9, 0x49, 0xa9, 0xbc, 0x67, 0x66, 0xb0,
+ 0x1b, 0x06, 0xa8, 0xef, 0x70, 0xd2, 0x67, 0x48,
+ 0xc6, 0xa7, 0xfc, 0x36, 0xd8, 0x0a, 0xe4, 0xc5,
+ 0x52, 0x0f, 0x7c, 0x4a, 0xb0, 0xac, 0x85, 0x44,
+ 0x42, 0x4f, 0xa4, 0x05, 0x16, 0x2f, 0xef, 0x5a,
+ 0x6b, 0x7f, 0x22, 0x94, 0x98, 0x06, 0x36, 0x18,
+ 0xd3, 0x9f, 0x00, 0x03, 0xcb, 0x5f, 0xb8, 0xd1,
+ 0xc8, 0x6b, 0x64, 0x34, 0x97, 0xda, 0x1f, 0xf9,
+ 0x45, 0xc8, 0xd3, 0xbe, 0xde, 0xca, 0x4f, 0x47,
+ 0x97, 0x02, 0xa7, 0xa7, 0x35, 0xf0, 0x43, 0xdd,
+ 0xb1, 0xd6, 0xaa, 0xad, 0xe3, 0xc4, 0xa0, 0xac,
+ 0x7c, 0xa7, 0xf3, 0xfa, 0x52, 0x79, 0xbe, 0xf5,
+ 0x6f, 0x82, 0xcd, 0x7a, 0x2f, 0x38, 0x67, 0x2e,
+ 0x82, 0x48, 0x14, 0xe1, 0x07, 0x00, 0x30, 0x0a,
+ 0x05, 0x5e, 0x16, 0x30, 0xb8, 0xf1, 0xcb, 0x0e,
+ 0x91, 0x9f, 0x5e, 0x94, 0x20, 0x10, 0xa4, 0x16,
+ 0xe2, 0xbf, 0x48, 0xcb, 0x46, 0x99, 0x3d, 0x3c,
+ 0xb6, 0xa5, 0x1c, 0x19, 0xba, 0xcf, 0x86, 0x47,
+ 0x85, 0xa0, 0x0b, 0xc2, 0xec, 0xff, 0x15, 0xd3,
+ 0x50, 0x87, 0x5b, 0x24, 0x6e, 0xd5, 0x3e, 0x68,
+ 0xbe, 0x6f, 0x55, 0xbd, 0x7e, 0x05, 0xcf, 0xc2,
+ 0xb2, 0xed, 0x64, 0x32, 0x19, 0x8a, 0x64, 0x44,
+ 0xb6, 0xd8, 0xc2, 0x47, 0xfa, 0xb9, 0x41, 0xf5,
+ 0x69, 0x76, 0x8b, 0x5c, 0x42, 0x93, 0x66, 0xf1,
+ 0xd3, 0xf0, 0x0f, 0x03, 0x45, 0xb9, 0x61, 0x23,
+ 0xd5, 0x62, 0x04, 0xc0, 0x1c, 0x63, 0xb2, 0x2c,
+ 0xe7, 0x8b, 0xaf, 0x11, 0x6e, 0x52, 0x5e, 0xd9,
+ 0x0f, 0xde, 0xa3, 0x9f, 0xa4, 0x69, 0x49, 0x4d,
+ 0x38, 0x66, 0xc3, 0x1e, 0x05, 0xf2, 0x95, 0xff,
+ 0x21, 0xfe, 0xa8, 0xd4, 0xe6, 0xe1, 0x3d, 0x67,
+ 0xe4, 0x7c, 0xe7, 0x22, 0xe9, 0x69, 0x8a, 0x1c,
+ 0x10, 0x48, 0xd6, 0x8e, 0xbc, 0xde, 0x76, 0xb8,
+ 0x6f, 0xcf, 0x97, 0x6e, 0xab, 0x8a, 0xa9, 0x79,
+ 0x02, 0x68, 0xb7, 0x06, 0x8e, 0x01, 0x7a, 0x8b,
+ 0x9b, 0x74, 0x94, 0x09, 0x51, 0x4f, 0x10, 0x53,
+ 0x02, 0x7f, 0xd1, 0x6c, 0x37, 0x86, 0xea, 0x1b,
+ 0xac, 0x5f, 0x15, 0xcb, 0x79, 0x71, 0x1e, 0xe2,
+ 0xab, 0xe8, 0x2f, 0x5c, 0xf8, 0xb1, 0x3a, 0xe7,
+ 0x30, 0x30, 0xef, 0x5b, 0x9e, 0x44, 0x57, 0xe7,
+ 0x5d, 0x13, 0x04, 0xf9, 0x88, 0xd6, 0x2d, 0xd6,
+ 0xfc, 0x4b, 0x94, 0xed, 0x38, 0xba, 0x83, 0x1d,
+ 0xa4, 0xb7, 0x63, 0x49, 0x71, 0xb6, 0xcd, 0x8e,
+ 0xc3, 0x25, 0xd9, 0xc6, 0x1c, 0x00, 0xf1, 0xdf,
+ 0x73, 0x62, 0x7e, 0xd3, 0x74, 0x5a, 0x5e, 0x84,
+ 0x89, 0xf3, 0xa9, 0x5c, 0x69, 0x63, 0x9c, 0x32,
+ 0xcd, 0x6e, 0x1d, 0x53, 0x7a, 0x85, 0xf7, 0x5c,
+ 0xc8, 0x44, 0x72, 0x6e, 0x8a, 0x72, 0xfc, 0x00,
+ 0x77, 0xad, 0x22, 0x00, 0x0f, 0x1d, 0x50, 0x78,
+ 0xf6, 0xb8, 0x66, 0x31, 0x8c, 0x66, 0x8f, 0x1a,
+ 0xd0, 0x3d, 0x5a, 0x5f, 0xce, 0xd5, 0x21, 0x9f,
+ 0x2e, 0xab, 0xbd, 0x0a, 0xa5, 0xc0, 0xf4, 0x60,
+ 0xd1, 0x83, 0xf0, 0x44, 0x04, 0xa0, 0xd6, 0xf4,
+ 0x69, 0x55, 0x8e, 0x81, 0xfa, 0xb2, 0x4a, 0x16,
+ 0x79, 0x05, 0xab, 0x4c, 0x78, 0x78, 0x50, 0x2a,
+ 0xd3, 0xe3, 0x8f, 0xdb, 0xe6, 0x2a, 0x41, 0x55,
+ 0x6c, 0xec, 0x37, 0x32, 0x57, 0x59, 0x53, 0x3c,
+ 0xe8, 0xf2, 0x5f, 0x36, 0x7c, 0x87, 0xbb, 0x55,
+ 0x78, 0xd6, 0x67, 0xae, 0x93, 0xf9, 0xe2, 0xfd,
+ 0x99, 0xbc, 0xbc, 0x5f, 0x2f, 0xbb, 0xa8, 0x8c,
+ 0xf6, 0x51, 0x61, 0x39, 0x42, 0x0f, 0xcf, 0xf3,
+ 0xb7, 0x36, 0x1d, 0x86, 0x32, 0x2c, 0x4b, 0xd8,
+ 0x4c, 0x82, 0xf3, 0x35, 0xab, 0xb1, 0x52, 0xc4,
+ 0xa9, 0x34, 0x11, 0x37, 0x3a, 0xaa, 0x82, 0x20
+};
+
+/*
+ * Vector 14
+ * Key1 2718281828459045235360287471352662497757247093699959574966967627
+ * Key2 3141592653589793238462643383279502884197169399375105820974944592
+ * Data Unit Sequence Number ffffffffff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * PTX 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f
+ * PTX 202122232425262728292a2b2c2d2e2f303132333435363738393a3b3c3d3e3f
+ * PTX 404142434445464748494a4b4c4d4e4f505152535455565758595a5b5c5d5e5f
+ * PTX 606162636465666768696a6b6c6d6e6f707172737475767778797a7b7c7d7e7f
+ * PTX 808182838485868788898a8b8c8d8e8f909192939495969798999a9b9c9d9e9f
+ * PTX a0a1a2a3a4a5a6a7a8a9aaabacadaeafb0b1b2b3b4b5b6b7b8b9babbbcbdbebf
+ * PTX c0c1c2c3c4c5c6c7c8c9cacbcccdcecfd0d1d2d3d4d5d6d7d8d9dadbdcdddedf
+ * PTX e0e1e2e3e4e5e6e7e8e9eaebecedeeeff0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
+ * CTX 64497e5a831e4a932c09be3e5393376daa599548b816031d224bbf50a818ed23
+ * CTX 50eae7e96087c8a0db51ad290bd00c1ac1620857635bf246c176ab463be30b80
+ * CTX 8da548081ac847b158e1264be25bb0910bbc92647108089415d45fab1b3d2604
+ * CTX e8a8eff1ae4020cfa39936b66827b23f371b92200be90251e6d73c5f86de5fd4
+ * CTX a950781933d79a28272b782a2ec313efdfcc0628f43d744c2dc2ff3dcb66999b
+ * CTX 50c7ca895b0c64791eeaa5f29499fb1c026f84ce5b5c72ba1083cddb5ce45434
+ * CTX 631665c333b60b11593fb253c5179a2c8db813782a004856a1653011e93fb6d8
+ * CTX 76c18366dd8683f53412c0c180f9c848592d593f8609ca736317d356e13e2bff
+ * CTX 3a9f59cd9aeb19cd482593d8c46128bb32423b37a9adfb482b99453fbe25a41b
+ * CTX f6feb4aa0bef5ed24bf73c762978025482c13115e4015aac992e5613a3b5c2f6
+ * CTX 85b84795cb6e9b2656d8c88157e52c42f978d8634c43d06fea928f2822e465aa
+ * CTX 6576e9bf419384506cc3ce3c54ac1a6f67dc66f3b30191e698380bc999b05abc
+ * CTX e19dc0c6dcc2dd001ec535ba18deb2df1a101023108318c75dc98611a09dc48a
+ * CTX 0acdec676fabdf222f07e026f059b672b56e5cbc8e1d21bbd867dd9272120546
+ * CTX 81d70ea737134cdfce93b6f82ae22423274e58a0821cc5502e2d0ab4585e94de
+ * CTX 6975be5e0b4efce51cd3e70c25a1fbbbd609d273ad5b0d59631c531f6a0a57b9
+ * Plaintext length (bytes): 512
+*/
+
+static uint8_t v14_key1[32] = {
+ 0x27, 0x18, 0x28, 0x18, 0x28, 0x45, 0x90, 0x45,
+ 0x23, 0x53, 0x60, 0x28, 0x74, 0x71, 0x35, 0x26,
+ 0x62, 0x49, 0x77, 0x57, 0x24, 0x70, 0x93, 0x69,
+ 0x99, 0x59, 0x57, 0x49, 0x66, 0x96, 0x76, 0x27
+};
+
+static uint8_t v14_key2[32] = {
+ 0x31, 0x41, 0x59, 0x26, 0x53, 0x58, 0x97, 0x93,
+ 0x23, 0x84, 0x62, 0x64, 0x33, 0x83, 0x27, 0x95,
+ 0x02, 0x88, 0x41, 0x97, 0x16, 0x93, 0x99, 0x37,
+ 0x51, 0x05, 0x82, 0x09, 0x74, 0x94, 0x45, 0x92
+};
+
+static uint8_t v14_TW[16] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+static uint8_t v14_PTX[512] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+};
+
+static uint8_t v14_CTX[512] = {
+ 0x64, 0x49, 0x7e, 0x5a, 0x83, 0x1e, 0x4a, 0x93,
+ 0x2c, 0x09, 0xbe, 0x3e, 0x53, 0x93, 0x37, 0x6d,
+ 0xaa, 0x59, 0x95, 0x48, 0xb8, 0x16, 0x03, 0x1d,
+ 0x22, 0x4b, 0xbf, 0x50, 0xa8, 0x18, 0xed, 0x23,
+ 0x50, 0xea, 0xe7, 0xe9, 0x60, 0x87, 0xc8, 0xa0,
+ 0xdb, 0x51, 0xad, 0x29, 0x0b, 0xd0, 0x0c, 0x1a,
+ 0xc1, 0x62, 0x08, 0x57, 0x63, 0x5b, 0xf2, 0x46,
+ 0xc1, 0x76, 0xab, 0x46, 0x3b, 0xe3, 0x0b, 0x80,
+ 0x8d, 0xa5, 0x48, 0x08, 0x1a, 0xc8, 0x47, 0xb1,
+ 0x58, 0xe1, 0x26, 0x4b, 0xe2, 0x5b, 0xb0, 0x91,
+ 0x0b, 0xbc, 0x92, 0x64, 0x71, 0x08, 0x08, 0x94,
+ 0x15, 0xd4, 0x5f, 0xab, 0x1b, 0x3d, 0x26, 0x04,
+ 0xe8, 0xa8, 0xef, 0xf1, 0xae, 0x40, 0x20, 0xcf,
+ 0xa3, 0x99, 0x36, 0xb6, 0x68, 0x27, 0xb2, 0x3f,
+ 0x37, 0x1b, 0x92, 0x20, 0x0b, 0xe9, 0x02, 0x51,
+ 0xe6, 0xd7, 0x3c, 0x5f, 0x86, 0xde, 0x5f, 0xd4,
+ 0xa9, 0x50, 0x78, 0x19, 0x33, 0xd7, 0x9a, 0x28,
+ 0x27, 0x2b, 0x78, 0x2a, 0x2e, 0xc3, 0x13, 0xef,
+ 0xdf, 0xcc, 0x06, 0x28, 0xf4, 0x3d, 0x74, 0x4c,
+ 0x2d, 0xc2, 0xff, 0x3d, 0xcb, 0x66, 0x99, 0x9b,
+ 0x50, 0xc7, 0xca, 0x89, 0x5b, 0x0c, 0x64, 0x79,
+ 0x1e, 0xea, 0xa5, 0xf2, 0x94, 0x99, 0xfb, 0x1c,
+ 0x02, 0x6f, 0x84, 0xce, 0x5b, 0x5c, 0x72, 0xba,
+ 0x10, 0x83, 0xcd, 0xdb, 0x5c, 0xe4, 0x54, 0x34,
+ 0x63, 0x16, 0x65, 0xc3, 0x33, 0xb6, 0x0b, 0x11,
+ 0x59, 0x3f, 0xb2, 0x53, 0xc5, 0x17, 0x9a, 0x2c,
+ 0x8d, 0xb8, 0x13, 0x78, 0x2a, 0x00, 0x48, 0x56,
+ 0xa1, 0x65, 0x30, 0x11, 0xe9, 0x3f, 0xb6, 0xd8,
+ 0x76, 0xc1, 0x83, 0x66, 0xdd, 0x86, 0x83, 0xf5,
+ 0x34, 0x12, 0xc0, 0xc1, 0x80, 0xf9, 0xc8, 0x48,
+ 0x59, 0x2d, 0x59, 0x3f, 0x86, 0x09, 0xca, 0x73,
+ 0x63, 0x17, 0xd3, 0x56, 0xe1, 0x3e, 0x2b, 0xff,
+ 0x3a, 0x9f, 0x59, 0xcd, 0x9a, 0xeb, 0x19, 0xcd,
+ 0x48, 0x25, 0x93, 0xd8, 0xc4, 0x61, 0x28, 0xbb,
+ 0x32, 0x42, 0x3b, 0x37, 0xa9, 0xad, 0xfb, 0x48,
+ 0x2b, 0x99, 0x45, 0x3f, 0xbe, 0x25, 0xa4, 0x1b,
+ 0xf6, 0xfe, 0xb4, 0xaa, 0x0b, 0xef, 0x5e, 0xd2,
+ 0x4b, 0xf7, 0x3c, 0x76, 0x29, 0x78, 0x02, 0x54,
+ 0x82, 0xc1, 0x31, 0x15, 0xe4, 0x01, 0x5a, 0xac,
+ 0x99, 0x2e, 0x56, 0x13, 0xa3, 0xb5, 0xc2, 0xf6,
+ 0x85, 0xb8, 0x47, 0x95, 0xcb, 0x6e, 0x9b, 0x26,
+ 0x56, 0xd8, 0xc8, 0x81, 0x57, 0xe5, 0x2c, 0x42,
+ 0xf9, 0x78, 0xd8, 0x63, 0x4c, 0x43, 0xd0, 0x6f,
+ 0xea, 0x92, 0x8f, 0x28, 0x22, 0xe4, 0x65, 0xaa,
+ 0x65, 0x76, 0xe9, 0xbf, 0x41, 0x93, 0x84, 0x50,
+ 0x6c, 0xc3, 0xce, 0x3c, 0x54, 0xac, 0x1a, 0x6f,
+ 0x67, 0xdc, 0x66, 0xf3, 0xb3, 0x01, 0x91, 0xe6,
+ 0x98, 0x38, 0x0b, 0xc9, 0x99, 0xb0, 0x5a, 0xbc,
+ 0xe1, 0x9d, 0xc0, 0xc6, 0xdc, 0xc2, 0xdd, 0x00,
+ 0x1e, 0xc5, 0x35, 0xba, 0x18, 0xde, 0xb2, 0xdf,
+ 0x1a, 0x10, 0x10, 0x23, 0x10, 0x83, 0x18, 0xc7,
+ 0x5d, 0xc9, 0x86, 0x11, 0xa0, 0x9d, 0xc4, 0x8a,
+ 0x0a, 0xcd, 0xec, 0x67, 0x6f, 0xab, 0xdf, 0x22,
+ 0x2f, 0x07, 0xe0, 0x26, 0xf0, 0x59, 0xb6, 0x72,
+ 0xb5, 0x6e, 0x5c, 0xbc, 0x8e, 0x1d, 0x21, 0xbb,
+ 0xd8, 0x67, 0xdd, 0x92, 0x72, 0x12, 0x05, 0x46,
+ 0x81, 0xd7, 0x0e, 0xa7, 0x37, 0x13, 0x4c, 0xdf,
+ 0xce, 0x93, 0xb6, 0xf8, 0x2a, 0xe2, 0x24, 0x23,
+ 0x27, 0x4e, 0x58, 0xa0, 0x82, 0x1c, 0xc5, 0x50,
+ 0x2e, 0x2d, 0x0a, 0xb4, 0x58, 0x5e, 0x94, 0xde,
+ 0x69, 0x75, 0xbe, 0x5e, 0x0b, 0x4e, 0xfc, 0xe5,
+ 0x1c, 0xd3, 0xe7, 0x0c, 0x25, 0xa1, 0xfb, 0xbb,
+ 0xd6, 0x09, 0xd2, 0x73, 0xad, 0x5b, 0x0d, 0x59,
+ 0x63, 0x1c, 0x53, 0x1f, 0x6a, 0x0a, 0x57, 0xb9
+};
+
+//
+// Define vector of structs, with pointers to the statically defined vectors
+
+struct xts_vector vlist[NVEC] = {
+
+ // pointers to the statically defined vectors here
+
+ // Vector 10
+ {sizeof(v10_CTX), v10_key1, v10_key2, v10_TW, v10_PTX, v10_CTX}
+ ,
+ // Vector 11
+ {sizeof(v11_CTX), v11_key1, v11_key2, v11_TW, v11_PTX, v11_CTX}
+ ,
+ // Vector 12
+ {sizeof(v12_CTX), v12_key1, v12_key2, v12_TW, v12_PTX, v12_CTX}
+ ,
+ // Vector 13
+ {sizeof(v13_CTX), v13_key1, v13_key2, v13_TW, v13_PTX, v13_CTX}
+ ,
+ // Vector 14
+ {sizeof(v14_CTX), v14_key1, v14_key2, v14_TW, v14_PTX, v14_CTX}
+
+};
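The table above is what the XTS test programs iterate over. A minimal verification sketch is shown below; the field names (ptlen, key1, key2, TW, PTX, CTX) are only assumed from the initializer order, since struct xts_vector is defined earlier in this header, and the XTS_AES_256_enc prototype is the one declared in aes_xts.h later in this patch.

    /* Sketch of a check loop over vlist[].  Field names are assumed from the
     * initializer order above; see struct xts_vector earlier in this header
     * for the real definition. */
    #include <stdio.h>
    #include <string.h>
    #include "aes_xts.h"

    static int verify_xts_256_vectors(void)
    {
        uint8_t ct[512];        /* the vectors in this file are <= 512 bytes */
        int i;

        for (i = 0; i < NVEC; i++) {
            const struct xts_vector *v = &vlist[i];

            if (v->ptlen > sizeof(ct))
                continue;
            XTS_AES_256_enc(v->key2, v->key1, v->TW, v->ptlen, v->PTX, ct);
            if (memcmp(ct, v->CTX, v->ptlen) != 0) {
                printf("XTS-AES-256 vector %d failed\n", i);
                return -1;
            }
        }
        return 0;
    }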
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm
new file mode 100644
index 000000000..2635f998d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_128_multibinary.asm
@@ -0,0 +1,78 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+
+extern XTS_AES_128_enc_sse
+extern XTS_AES_128_enc_avx
+
+extern XTS_AES_128_enc_expanded_key_sse
+extern XTS_AES_128_enc_expanded_key_avx
+
+extern XTS_AES_128_dec_sse
+extern XTS_AES_128_dec_avx
+
+extern XTS_AES_128_dec_expanded_key_sse
+extern XTS_AES_128_dec_expanded_key_avx
+
+
+section .text
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate XTS_AES_128_enc, XTS_AES_128_enc_expanded_key, XTS_AES_128_dec, and XTS_AES_128_dec_expanded_key
+;;;;
+mbin_interface XTS_AES_128_enc
+mbin_dispatch_init XTS_AES_128_enc, XTS_AES_128_enc_sse, XTS_AES_128_enc_avx, XTS_AES_128_enc_avx
+
+mbin_interface XTS_AES_128_enc_expanded_key
+mbin_dispatch_init XTS_AES_128_enc_expanded_key, XTS_AES_128_enc_expanded_key_sse, XTS_AES_128_enc_expanded_key_avx, XTS_AES_128_enc_expanded_key_avx
+
+mbin_interface XTS_AES_128_dec
+mbin_dispatch_init XTS_AES_128_dec, XTS_AES_128_dec_sse, XTS_AES_128_dec_avx, XTS_AES_128_dec_avx
+
+mbin_interface XTS_AES_128_dec_expanded_key
+mbin_dispatch_init XTS_AES_128_dec_expanded_key, XTS_AES_128_dec_expanded_key_sse, XTS_AES_128_dec_expanded_key_avx, XTS_AES_128_dec_expanded_key_avx
+
+
+;;; func core, ver, snum
+slversion XTS_AES_128_enc, 01, 04, 0071
+slversion XTS_AES_128_enc_expanded_key, 01, 04, 0072
+slversion XTS_AES_128_dec, 01, 04, 0073
+slversion XTS_AES_128_dec_expanded_key, 01, 04, 0074
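The mbin_interface/mbin_dispatch_init macros (pulled in from multibinary.asm) export one symbol whose first call picks the SSE or AVX body at runtime and rebinds itself, so callers simply invoke XTS_AES_128_enc and friends. The assembly is the authoritative mechanism; the C below is only an analogy of that self-resolving dispatch pattern, using the GCC/Clang __builtin_cpu_supports() helper for the feature test.

    #include <stdint.h>

    typedef void (*xts_fn)(uint8_t *, uint8_t *, uint8_t *, uint64_t,
                           const uint8_t *, uint8_t *);

    /* SSE and AVX bodies are provided by the .asm files in this patch. */
    extern void XTS_AES_128_enc_sse(uint8_t *, uint8_t *, uint8_t *, uint64_t,
                                    const uint8_t *, uint8_t *);
    extern void XTS_AES_128_enc_avx(uint8_t *, uint8_t *, uint8_t *, uint64_t,
                                    const uint8_t *, uint8_t *);

    static void xts_128_enc_resolve(uint8_t *, uint8_t *, uint8_t *, uint64_t,
                                    const uint8_t *, uint8_t *);

    /* The entry point starts out pointing at the resolver. */
    static xts_fn xts_128_enc = xts_128_enc_resolve;

    static void xts_128_enc_resolve(uint8_t *k2, uint8_t *k1, uint8_t *tw,
                                    uint64_t n, const uint8_t *pt, uint8_t *ct)
    {
        /* Decide once, then rebind so later calls jump straight to the
         * chosen implementation (the asm macros patch a jump instead). */
        xts_128_enc = __builtin_cpu_supports("avx")
            ? XTS_AES_128_enc_avx : XTS_AES_128_enc_sse;
        xts_128_enc(k2, k1, tw, n, pt, ct);
    }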
diff --git a/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm
new file mode 100644
index 000000000..3452b4f24
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/aes/xts_aes_256_multibinary.asm
@@ -0,0 +1,78 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+default rel
+[bits 64]
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+
+extern XTS_AES_256_enc_sse
+extern XTS_AES_256_enc_avx
+
+extern XTS_AES_256_enc_expanded_key_sse
+extern XTS_AES_256_enc_expanded_key_avx
+
+extern XTS_AES_256_dec_sse
+extern XTS_AES_256_dec_avx
+
+extern XTS_AES_256_dec_expanded_key_sse
+extern XTS_AES_256_dec_expanded_key_avx
+
+
+section .text
+
+%include "multibinary.asm"
+
+;;;;
+; instantiate XTS_AES_256_enc, XTS_AES_256_enc_expanded_key, XTS_AES_256_dec, and XTS_AES_256_dec_expanded_key
+;;;;
+mbin_interface XTS_AES_256_enc
+mbin_dispatch_init XTS_AES_256_enc, XTS_AES_256_enc_sse, XTS_AES_256_enc_avx, XTS_AES_256_enc_avx
+
+mbin_interface XTS_AES_256_enc_expanded_key
+mbin_dispatch_init XTS_AES_256_enc_expanded_key, XTS_AES_256_enc_expanded_key_sse, XTS_AES_256_enc_expanded_key_avx, XTS_AES_256_enc_expanded_key_avx
+
+mbin_interface XTS_AES_256_dec
+mbin_dispatch_init XTS_AES_256_dec, XTS_AES_256_dec_sse, XTS_AES_256_dec_avx, XTS_AES_256_dec_avx
+
+mbin_interface XTS_AES_256_dec_expanded_key
+mbin_dispatch_init XTS_AES_256_dec_expanded_key, XTS_AES_256_dec_expanded_key_sse, XTS_AES_256_dec_expanded_key_avx, XTS_AES_256_dec_expanded_key_avx
+
+
+;;; func core, ver, snum
+slversion XTS_AES_256_enc, 01, 04, 0076
+slversion XTS_AES_256_enc_expanded_key, 01, 04, 0077
+slversion XTS_AES_256_dec, 01, 04, 0078
+slversion XTS_AES_256_dec_expanded_key, 01, 04, 0079
diff --git a/src/crypto/isa-l/isa-l_crypto/autogen.sh b/src/crypto/isa-l/isa-l_crypto/autogen.sh
new file mode 100755
index 000000000..0a3189383
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/autogen.sh
@@ -0,0 +1,17 @@
+#!/bin/sh -e
+
+autoreconf --install --symlink -f
+
+libdir() {
+ echo $(cd $1/$(gcc -print-multi-os-directory); pwd)
+}
+
+args="--prefix=/usr --libdir=$(libdir /usr/lib)"
+
+echo
+echo "----------------------------------------------------------------"
+echo "Initialized build system. For a common configuration please run:"
+echo "----------------------------------------------------------------"
+echo
+echo "./configure $args"
+echo
diff --git a/src/crypto/isa-l/isa-l_crypto/configure.ac b/src/crypto/isa-l/isa-l_crypto/configure.ac
new file mode 100644
index 000000000..f074e0f84
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/configure.ac
@@ -0,0 +1,174 @@
+# -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+
+AC_PREREQ(2.69)
+AC_INIT([libisal_crypto],
+ [2.16.0],
+ [sg.support.isal@intel.com],
+ [isa-l_crypto],
+ [http://01.org/storage-acceleration-library])
+AC_CONFIG_SRCDIR([])
+AC_CONFIG_AUX_DIR([build-aux])
+AM_INIT_AUTOMAKE([
+ foreign
+ 1.11
+ -Wall
+ -Wno-portability
+ silent-rules
+ tar-pax
+ no-dist-gzip
+ dist-xz
+ subdir-objects
+])
+AM_PROG_AS
+
+# Check for programs
+AC_PROG_CC_STDC
+AC_USE_SYSTEM_EXTENSIONS
+AM_SILENT_RULES([yes])
+LT_INIT
+AC_PREFIX_DEFAULT([/usr])
+AC_PROG_SED
+AC_PROG_MKDIR_P
+
+# Options
+AC_ARG_ENABLE([debug],
+ AS_HELP_STRING([--enable-debug], [enable debug messages @<:@default=disabled@:>@]),
+ [], [enable_debug=no])
+AS_IF([test "x$enable_debug" = "xyes"], [
+ AC_DEFINE(ENABLE_DEBUG, [1], [Debug messages.])
+])
+
+# Check for yasm and yasm features
+AC_CHECK_PROG(HAVE_YASM, yasm, yes, no)
+if test "$HAVE_YASM" = "no"; then
+ AC_MSG_RESULT([no yasm])
+else
+ AC_MSG_CHECKING([for modern yasm])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vmovdqa %xmm0, %xmm1;]])])
+ if yasm -f elf64 -p gas conftest.c ; then
+ with_modern_yasm=yes
+ AC_MSG_RESULT([yes])
+ AC_MSG_CHECKING([for optional yasm AVX512 support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpshufb %zmm0, %zmm1, %zmm2;]])])
+ if yasm -f elf64 -p gas conftest.c 2> /dev/null; then
+ yasm_knows_avx512=yes
+ AC_MSG_RESULT([yes])
+ else
+ AC_MSG_RESULT([no])
+ fi
+ else
+ AC_MSG_FAILURE([no])
+ fi
+fi
+
+# Check for nasm and nasm features
+AC_CHECK_PROG(HAVE_NASM, nasm, yes, no)
+if test "$HAVE_NASM" = "no"; then
+ AC_MSG_RESULT([no nasm])
+else
+ AC_MSG_CHECKING([for modern nasm])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[pblendvb xmm2, xmm1;]])])
+ sed -i -e '/pblendvb/!d' conftest.c
+ if nasm -f elf64 conftest.c 2> /dev/null; then
+ with_modern_nasm=yes
+ AC_MSG_RESULT([yes])
+ AC_MSG_CHECKING([for optional nasm AVX512 support])
+ AC_LANG_CONFTEST([AC_LANG_SOURCE([[vpshufb zmm0, zmm1, zmm2;]])])
+ sed -i -e '/vpshufb/!d' conftest.c
+ if nasm -f elf64 conftest.c 2> /dev/null; then
+ nasm_knows_avx512=yes
+ AC_MSG_RESULT([yes])
+ else
+ AC_MSG_RESULT([no])
+ fi
+ else
+ AC_MSG_RESULT([no])
+ fi
+fi
+
+# Pick an assembler yasm or nasm
+if test x"$AS" = x""; then
+ if test x"$yasm_knows_avx512" = x"yes"; then
+ AS=yasm
+ elif test x"$nasm_knows_avx512" = x"yes"; then
+ AS=nasm
+ elif test x"$with_modern_yasm" = x"yes"; then
+ AS=yasm
+ elif test x"$with_modern_nasm" = x"yes"; then
+ AS=nasm
+ else
+ AC_MSG_ERROR([No modern yasm or nasm found as required. Yasm should be 1.2.0 or later, and nasm should be v2.11.01 or later.])
+ fi
+fi
+echo "Using assembler $AS"
+
+if test \( x"$AS" = x"yasm" -a x"$yasm_knows_avx512" = x"yes" \) -o \( x"$AS" = x"nasm" -a x"$nasm_knows_avx512" = x"yes" \); then
+ AC_DEFINE(HAVE_AS_KNOWS_AVX512, [1], [Assembler can do AVX512.])
+ have_as_knows_avx512=yes
+else
+ AC_MSG_RESULT([Assembler does not understand AVX512 opcodes. Consider upgrading for best performance.])
+fi
+AM_CONDITIONAL(USE_YASM, test x"$AS" = x"yasm")
+AM_CONDITIONAL(USE_NASM, test x"$AS" = x"nasm")
+AM_CONDITIONAL(WITH_AVX512, test x"$have_as_knows_avx512" = x"yes")
+
+
+case $target in
+ *linux*) arch=linux yasm_args="-f elf64";;
+ *darwin*) arch=darwin yasm_args="-f macho64 --prefix=_ ";;
+ *netbsd*) arch=netbsd yasm_args="-f elf64";;
+ *) arch=unknown yasm_args="-f elf64";;
+esac
+AC_SUBST([yasm_args])
+AM_CONDITIONAL(DARWIN, test x"$arch" = x"darwin")
+AC_MSG_RESULT([Using yasm args target "$arch" "$yasm_args"])
+
+# Check for header files
+AC_CHECK_HEADERS([limits.h stdint.h stdlib.h string.h])
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_C_INLINE
+AC_TYPE_SIZE_T
+AC_TYPE_UINT16_T
+AC_TYPE_UINT32_T
+AC_TYPE_UINT64_T
+AC_TYPE_UINT8_T
+
+# Checks for library functions.
+AC_FUNC_MALLOC # Used only in tests
+AC_CHECK_FUNCS([memmove memset])
+
+my_CFLAGS="\
+-Wall \
+-Wchar-subscripts \
+-Wformat-security \
+-Wnested-externs \
+-Wpointer-arith \
+-Wshadow \
+-Wstrict-prototypes \
+-Wtype-limits \
+"
+AC_SUBST([my_CFLAGS])
+
+AC_CONFIG_FILES([\
+ Makefile\
+ libisal_crypto.pc
+])
+
+AC_OUTPUT
+AC_MSG_RESULT([
+ $PACKAGE $VERSION
+ =====
+
+ prefix: ${prefix}
+ sysconfdir: ${sysconfdir}
+ libdir: ${libdir}
+ includedir: ${includedir}
+
+ compiler: ${CC}
+ cflags: ${CFLAGS}
+ ldflags: ${LDFLAGS}
+
+ debug: ${enable_debug}
+])
diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h b/src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h
new file mode 100644
index 000000000..bff4a62d8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/aes_cbc.h
@@ -0,0 +1,161 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/**
+ * @file aes_cbc.h
+ * @brief AES CBC encryption/decryption function prototypes.
+ *
+ */
+#ifndef _AES_CBC_h
+#define _AES_CBC_h
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+
+#endif
+
+typedef enum cbc_key_size { CBC_128_BITS = 16, CBC_192_BITS = 24, CBC_256_BITS = 32} cbc_key_size;
+#define CBC_ROUND_KEY_LEN (16)
+#define CBC_128_KEY_ROUNDS (10+1) /*expanded key holds 10 key rounds plus original key*/
+#define CBC_192_KEY_ROUNDS (12+1) /*expanded key holds 12 key rounds plus original key*/
+#define CBC_256_KEY_ROUNDS (14+1) /*expanded key holds 14 key rounds plus original key*/
+#define CBC_MAX_KEYS_SIZE (CBC_ROUND_KEY_LEN * CBC_256_KEY_ROUNDS)
+
+#define CBC_IV_DATA_LEN (16)
+
+/** @brief holds intermediate key data used in encryption/decryption
+ *
+ */
+struct cbc_key_data { // must be 16 byte aligned
+ uint8_t enc_keys[CBC_MAX_KEYS_SIZE];
+ uint8_t dec_keys[CBC_MAX_KEYS_SIZE];
+};
+
+/** @brief CBC-AES key pre-computation done once for a key
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ * arg 1: in: pointer to key
+ * arg 2: OUT: pointer to a key expanded data
+ */
+int aes_cbc_precomp(
+ uint8_t *key,
+ int key_size,
+ struct cbc_key_data *keys_blk
+);
+
+/** @brief CBC-AES 128 bit key Decryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ * arg 1: in: pointer to input (cipher text)
+ * arg 2: IV: pointer to IV, Must be 16 bytes aligned to a 16 byte boundary
+ * arg 3: keys: pointer to keys, Must be on a 16 byte boundary and length of key size * key rounds
+ * arg 4: OUT: pointer to output (plain text ... in-place allowed)
+ * arg 5: len_bytes: length in bytes (multiple of 16)
+ */
+void aes_cbc_dec_128(
+ void *in,
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or dec_keys of cbc_key_data
+ void *out,
+ uint64_t len_bytes); //!< Must be a multiple of 16 bytes
+
+/** @brief CBC-AES 192 bit key Decryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aes_cbc_dec_192(
+ void *in,
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or dec_keys of cbc_key_data
+ void *out,
+ uint64_t len_bytes); //!< Must be a multiple of 16 bytes
+
+/** @brief CBC-AES 256 bit key Decryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aes_cbc_dec_256(
+ void *in,
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or dec_keys of cbc_key_data
+ void *out,
+ uint64_t len_bytes); //!< Must be a multiple of 16 bytes
+
+/** @brief CBC-AES 128 bit key Encryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ * arg 1: in: pointer to input (plain text)
+ * arg 2: IV: pointer to IV, Must be 16 bytes aligned to a 16 byte boundary
+ * arg 3: keys: pointer to keys, Must be on a 16 byte boundary and length of key size * key rounds
+ * arg 4: OUT: pointer to output (cipher text ... in-place allowed)
+ * arg 5: len_bytes: length in bytes (multiple of 16)
+ */
+int aes_cbc_enc_128(
+ void *in,
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or enc_keys of cbc_key_data
+ void *out,
+ uint64_t len_bytes); //!< Must be a multiple of 16 bytes
+
+/** @brief CBC-AES 192 bit key Encryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+int aes_cbc_enc_192(
+ void *in,
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or enc_keys of cbc_key_data
+ void *out,
+ uint64_t len_bytes); //!< Must be a multiple of 16 bytes
+
+/** @brief CBC-AES 256 bit key Encryption
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+int aes_cbc_enc_256(
+ void *in,
+ uint8_t *IV, //!< Must be 16 bytes aligned to a 16 byte boundary
+ uint8_t *keys, //!< Must be on a 16 byte boundary and length of key size * key rounds or enc_keys of cbc_key_data
+ void *out,
+ uint64_t len_bytes); //!< Must be a multiple of 16 bytes
+
+#ifdef __cplusplus
+}
+#endif //__cplusplus
+#endif //ifndef _AES_CBC_h
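A minimal round-trip sketch with this header's prototypes. It assumes aes_cbc_precomp() takes the key size in bytes, as the cbc_key_size enum suggests, uses POSIX posix_memalign plus a GCC-style alignment attribute to honor the 16-byte alignment requirements noted above, and is not the library's own test code.

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include "aes_cbc.h"

    int cbc_128_roundtrip(void)
    {
        struct cbc_key_data *keys;
        uint8_t key[CBC_128_BITS] = { 0 };                  /* example key */
        uint8_t iv[CBC_IV_DATA_LEN] __attribute__((aligned(16))) = { 0 };
        uint8_t pt[64], ct[64], out[64];

        memset(pt, 0x5a, sizeof(pt));                       /* multiple of 16 */

        /* cbc_key_data must sit on a 16-byte boundary. */
        if (posix_memalign((void **)&keys, 16, sizeof(*keys)))
            return -1;

        aes_cbc_precomp(key, CBC_128_BITS, keys);           /* once per key */
        aes_cbc_enc_128(pt, iv, keys->enc_keys, ct, sizeof(pt));
        aes_cbc_dec_128(ct, iv, keys->dec_keys, out, sizeof(ct));

        free(keys);
        return memcmp(pt, out, sizeof(pt)) ? -1 : 0;
    }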
diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h b/src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h
new file mode 100644
index 000000000..1e7127e95
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/aes_gcm.h
@@ -0,0 +1,340 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/**
+ * @file aes_gcm.h
+ * @brief AES GCM encryption/decryption function prototypes.
+ *
+ * At build time there is an option to use non-temporal loads and stores
+ * selected by defining the compile time option NT_LDST. The use of this option
+ * places the following restriction on the gcm encryption functions:
+ *
+ * - The plaintext and ciphertext buffers must be aligned on a 16 byte boundary.
+ *
+ * - When using the streaming API, all partial input buffers must be a multiple
+ * of 16 bytes long except for the last input buffer.
+ *
+ * - In-place encryption/decryption is not recommended.
+ *
+ */
+
+/*
+; References:
+; This code was derived and highly optimized from the code described in paper:
+; Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on Intel Architecture Processors. August, 2010
+;
+; For the shift-based reductions used in this code, we used the method described in paper:
+; Shay Gueron, Michael E. Kounavis. Intel Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode. January, 2010.
+;
+;
+;
+; Assumptions: Support for SSE4.1 or greater, AVX or AVX2
+;
+;
+; iv:
+; 0 1 2 3
+; 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Salt (From the SA) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | Initialization Vector |
+; | (This is the sequence number from IPSec header) |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+; | 0x1 |
+; +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+;
+; TLen:
+; from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+;
+ */
+#ifndef _AES_GCM_h
+#define _AES_GCM_h
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8. */
+#define MAX_TAG_LEN (16)
+//
+// IV data is limited to 16 bytes. The last DWORD (4 bytes) must be 0x1
+//
+#define GCM_IV_LEN (16)
+#define GCM_IV_DATA_LEN (12)
+#define GCM_IV_END_MARK {0x00, 0x00, 0x00, 0x01};
+#define GCM_IV_END_START (12)
+
+#define LONGEST_TESTED_AAD_LENGTH (2* 1024)
+
+// Key lengths of 128 and 256 supported
+#define GCM_128_KEY_LEN (16)
+#define GCM_256_KEY_LEN (32)
+
+#define GCM_BLOCK_LEN 16
+#define GCM_ENC_KEY_LEN 16
+#define GCM_KEY_SETS (15) /*exp key + 14 exp round keys*/
+/** @brief holds intermediate key data needed to improve performance
+ *
+ * gcm_data hold internal key information used by gcm128 and gcm256.
+ */
+struct gcm_data {
+ uint8_t expanded_keys[GCM_ENC_KEY_LEN * GCM_KEY_SETS];
+ uint8_t shifted_hkey_1[GCM_ENC_KEY_LEN]; // store HashKey <<1 mod poly here
+ uint8_t shifted_hkey_2[GCM_ENC_KEY_LEN]; // store HashKey^2 <<1 mod poly here
+ uint8_t shifted_hkey_3[GCM_ENC_KEY_LEN]; // store HashKey^3 <<1 mod poly here
+ uint8_t shifted_hkey_4[GCM_ENC_KEY_LEN]; // store HashKey^4 <<1 mod poly here
+ uint8_t shifted_hkey_5[GCM_ENC_KEY_LEN]; // store HashKey^5 <<1 mod poly here
+ uint8_t shifted_hkey_6[GCM_ENC_KEY_LEN]; // store HashKey^6 <<1 mod poly here
+ uint8_t shifted_hkey_7[GCM_ENC_KEY_LEN]; // store HashKey^7 <<1 mod poly here
+ uint8_t shifted_hkey_8[GCM_ENC_KEY_LEN]; // store HashKey^8 <<1 mod poly here
+ uint8_t shifted_hkey_1_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_2_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_3_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_4_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_5_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_6_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_7_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+ uint8_t shifted_hkey_8_k[GCM_ENC_KEY_LEN]; // store XOR of High 64 bits and Low 64 bits of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+ // init, update and finalize context data
+ uint8_t aad_hash[GCM_BLOCK_LEN];
+ uint64_t aad_length;
+ uint64_t in_length;
+ uint8_t partial_block_enc_key[GCM_BLOCK_LEN];
+ uint8_t orig_IV[GCM_BLOCK_LEN];
+ uint8_t current_counter[GCM_BLOCK_LEN];
+ uint64_t partial_block_length;
+};
+
+/**
+ * @brief GCM-AES Encryption using 128 bit keys
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aesni_gcm128_enc(struct gcm_data *my_ctx_data,
+ uint8_t * out, //!< Ciphertext output. Encrypt in-place is allowed.
+ uint8_t const *in, //!< Plaintext input
+ uint64_t plaintext_len, //!< Length of data in Bytes for encryption.
+ uint8_t * iv, //!< Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialization Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD).
+ uint64_t aad_len, //!< Length of AAD.
+ uint8_t * auth_tag, //!< Authenticated Tag output.
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). Valid values are 16 (most likely), 12 or 8.
+ );
+
+
+/**
+ * @brief GCM-AES Decryption using 128 bit keys
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aesni_gcm128_dec(struct gcm_data *my_ctx_data,
+ uint8_t * out, //!< Plaintext output. Decrypt in-place is allowed.
+ uint8_t const *in, //!< Ciphertext input
+ uint64_t plaintext_len, //!< Length of data in Bytes for decryption.
+ uint8_t * iv, //!< Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialization Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD).
+ uint64_t aad_len, //!< Length of AAD.
+ uint8_t * auth_tag, //!< Authenticated Tag output.
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). Valid values are 16 (most likely), 12 or 8.
+ );
+
+/**
+ * @brief Start an AES-128-GCM encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aesni_gcm128_init( struct gcm_data *my_ctx_data,
+ uint8_t * iv, //!< Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialization Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD).
+ uint64_t aad_len //!< Length of AAD.
+ );
+
+/**
+ * @brief Encrypt a block of an AES-128-GCM encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aesni_gcm128_enc_update( struct gcm_data *my_ctx_data,
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed.
+ const uint8_t *in, //!< Plaintext input
+ uint64_t plaintext_len //!< Length of data in Bytes for encryption.
+ );
+
+/**
+ * @brief Decrypt a block of an AES-128-GCM encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aesni_gcm128_dec_update( struct gcm_data *my_ctx_data,
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed.
+ const uint8_t *in, //!< Ciphertext input
+ uint64_t plaintext_len //!< Length of data in Bytes for decryption.
+ );
+
+/**
+ * @brief End encryption of an AES-128-GCM encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aesni_gcm128_enc_finalize( struct gcm_data *my_ctx_data,
+ uint8_t *auth_tag, //!< Authenticated Tag output.
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8.
+ );
+
+/**
+ * @brief End decryption of an AES-128-GCM encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aesni_gcm128_dec_finalize( struct gcm_data *my_ctx_data,
+ uint8_t *auth_tag, //!< Authenticated Tag output.
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8.
+ );
+
+/**
+ * @brief pre-processes key data
+ *
+ * Prefills the gcm data with key values for each round and the initial sub hash key for tag encoding
+ */
+void aesni_gcm128_pre(uint8_t * key, struct gcm_data *gdata
+ );
+
+/**
+ * @brief GCM-AES Encryption using 256 bit keys
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aesni_gcm256_enc(struct gcm_data *my_ctx_data,
+ uint8_t * out, //!< Ciphertext output. Encrypt in-place is allowed.
+ uint8_t const *in, //!< Plaintext input
+ uint64_t plaintext_len, //!< Length of data in Bytes for encryption.
+ uint8_t * iv, //!< Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialization Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD).
+ uint64_t aad_len, //!< Length of AAD.
+ uint8_t * auth_tag, //!< Authenticated Tag output.
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). Valid values are 16 (most likely), 12 or 8.
+ );
+
+
+/**
+ * @brief GCM-AES Decryption using 256 bit keys
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aesni_gcm256_dec(struct gcm_data *my_ctx_data,
+ uint8_t * out, //!< Plaintext output. Decrypt in-place is allowed.
+ uint8_t const *in, //!< Ciphertext input
+ uint64_t plaintext_len, //!< Length of data in Bytes for decryption.
+ uint8_t * iv, //!< Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialization Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD).
+ uint64_t aad_len, //!< Length of AAD.
+ uint8_t * auth_tag, //!< Authenticated Tag output.
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes (must be a multiple of 4 bytes). Valid values are 16 (most likely), 12 or 8.
+ );
+
+/**
+ * @brief Start an AES-256-GCM encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aesni_gcm256_init( struct gcm_data *my_ctx_data,
+ uint8_t * iv, //!< Pre-counter block j0: 4 byte salt (from Security Association) concatenated with 8 byte Initialization Vector (from IPSec ESP Payload) concatenated with 0x00000001. 16-byte pointer.
+ uint8_t const *aad, //!< Additional Authentication Data (AAD).
+ uint64_t aad_len //!< Length of AAD.
+ );
+
+/**
+ * @brief Encrypt a block of an AES-256-GCM encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aesni_gcm256_enc_update( struct gcm_data *my_ctx_data,
+ uint8_t *out, //!< Ciphertext output. Encrypt in-place is allowed.
+ const uint8_t *in, //!< Plaintext input
+ uint64_t plaintext_len //!< Length of data in Bytes for encryption.
+ );
+
+/**
+ * @brief Decrypt a block of an AES-256-GCM encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aesni_gcm256_dec_update( struct gcm_data *my_ctx_data,
+ uint8_t *out, //!< Plaintext output. Decrypt in-place is allowed.
+ const uint8_t *in, //!< Ciphertext input
+ uint64_t plaintext_len //!< Length of data in Bytes for decryption.
+ );
+
+/**
+ * @brief End encryption of an AES-256-GCM encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aesni_gcm256_enc_finalize( struct gcm_data *my_ctx_data,
+ uint8_t *auth_tag, //!< Authenticated Tag output.
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8.
+ );
+
+/**
+ * @brief End decryption of an AES-256-GCM encryption message
+ *
+ * @requires SSE4.1 and AESNI
+ *
+ */
+void aesni_gcm256_dec_finalize( struct gcm_data *my_ctx_data,
+ uint8_t *auth_tag, //!< Authenticated Tag output.
+ uint64_t auth_tag_len //!< Authenticated Tag Length in bytes. Valid values are 16 (most likely), 12 or 8.
+ );
+
+/**
+ * @brief pre-processes key data
+ *
+ * Prefills the gcm data with key values for each round and the initial sub hash key for tag encoding
+ */
+void aesni_gcm256_pre(uint8_t * key, struct gcm_data *gdata);
+
+#ifdef __cplusplus
+}
+#endif //__cplusplus
+#endif //ifndef _AES_GCM_h
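A hedged one-shot sketch with the single-call 128-bit API above: build the 16-byte pre-counter block (12 bytes of IV data followed by the 0x00000001 end mark, as documented above), expand the key once with aesni_gcm128_pre(), then encrypt and produce the tag. The dec variants also emit a tag, which the caller would then compare against the received one.

    #include <stdint.h>
    #include <string.h>
    #include "aes_gcm.h"

    int gcm_128_one_shot(const uint8_t key[GCM_128_KEY_LEN],
                         const uint8_t iv_data[GCM_IV_DATA_LEN],
                         const uint8_t *aad, uint64_t aad_len,
                         const uint8_t *pt, uint64_t pt_len,
                         uint8_t *ct, uint8_t tag[MAX_TAG_LEN])
    {
        struct gcm_data gdata;
        uint8_t j0[GCM_IV_LEN] = { 0 };

        /* Pre-counter block: 12 bytes of IV data, last dword = 0x00000001. */
        memcpy(j0, iv_data, GCM_IV_DATA_LEN);
        j0[GCM_IV_LEN - 1] = 1;

        aesni_gcm128_pre((uint8_t *)key, &gdata);   /* expand key + hash keys */
        aesni_gcm128_enc(&gdata, ct, pt, pt_len, j0,
                         aad, aad_len, tag, MAX_TAG_LEN);
        return 0;
    }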
diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h b/src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h
new file mode 100644
index 000000000..dbbe25d14
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/aes_keyexp.h
@@ -0,0 +1,76 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _KEYEXP_128_H
+#define _KEYEXP_128_H
+
+/**
+ * @file aes_keyexp.h
+ * @brief AES key expansion functions
+ *
+ * This defines the interface to key expansion functions.
+ */
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @brief AES key expansion 128 bit
+* @requires SSE4.1
+*/
+void aes_keyexp_128(
+ uint8_t *key, //!< input key for AES-128, 16 bytes
+ uint8_t *exp_key_enc, //!< expanded encryption keys, 16*11 bytes
+ uint8_t *exp_key_dec //!< expanded decryption keys, 16*11 bytes
+ );
+
+/** @brief AES key expansion 192 bit
+* @requires SSE4.1
+*/
+void aes_keyexp_192(
+ uint8_t *key, //!< input key for AES-192, 16*1.5 bytes
+ uint8_t *exp_key_enc, //!< expanded encryption keys, 16*13 bytes
+ uint8_t *exp_key_dec //!< expanded decryption keys, 16*13 bytes
+ );
+
+/** @brief AES key expansion 256 bit
+* @requires SSE4.1
+*/
+void aes_keyexp_256(
+ uint8_t *key, //!< input key for AES-256, 16*2 bytes
+ uint8_t *exp_key_enc, //!< expanded encryption keys, 16*15 bytes
+ uint8_t *exp_key_dec //!< expanded decryption keys, 16*15 bytes
+ );
+
+#ifdef __cplusplus
+}
+#endif //__cplusplus
+#endif //ifndef _KEYEXP_128_H
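For illustration, a short sketch pairing this header with the expanded-key XTS entry points added elsewhere in this patch; the 16*11-byte buffer sizes follow the comments above, and the tweak side always uses the encryption schedule (see the note in aes_xts.h).

    #include <stdint.h>
    #include "aes_keyexp.h"
    #include "aes_xts.h"

    /* Expand both XTS keys once, then encrypt sectors with the
     * *_expanded_key variant so key expansion is not redone per sector. */
    void xts_128_enc_with_expanded_keys(uint8_t *data_key,  /* k1, 16 bytes */
                                        uint8_t *tweak_key, /* k2, 16 bytes */
                                        uint8_t *tweak,     /* 16 bytes     */
                                        const uint8_t *pt, uint8_t *ct,
                                        uint64_t sector_len)
    {
        uint8_t k1_enc[16 * 11], k1_dec[16 * 11];
        uint8_t k2_enc[16 * 11], k2_dec[16 * 11];

        aes_keyexp_128(data_key, k1_enc, k1_dec);
        aes_keyexp_128(tweak_key, k2_enc, k2_dec);

        /* The tweak key is always passed as an encryption schedule. */
        XTS_AES_128_enc_expanded_key(k2_enc, k1_enc, tweak,
                                     sector_len, pt, ct);
    }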
diff --git a/src/crypto/isa-l/isa-l_crypto/include/aes_xts.h b/src/crypto/isa-l/isa-l_crypto/include/aes_xts.h
new file mode 100644
index 000000000..b2d569851
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/aes_xts.h
@@ -0,0 +1,214 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#ifndef _AES_XTS_H
+#define _AES_XTS_H
+
+/**
+ * @file aes_xts.h
+ * @brief AES XTS encryption function prototypes.
+ *
+ * This defines the interface to optimized AES XTS functions
+
+<b>Pre-expanded keys</b>
+
+For encryption, the pre-expanded keys are stored in the order in which they will
+be used. For example, if Key[0] is the initial 128-bit key for an AES-128
+encryption, the remaining keys are stored as follows:
+
+<ul>
+ <li> Key[0] : Initial encryption key
+ <li> Key[1] : Round 1 encryption key
+ <li> Key[2] : Round 2 encryption key
+ <li> ...
+ <li> Key[10] : Round 10 encryption key
+</ul>
+
+For decryption, the order of keys is reversed. However, we apply the
+necessary aesimc instructions before storing the expanded keys. For the same key
+used above, the pre-expanded keys will be stored as follows:
+
+<ul>
+ <li> Key[0] : Round 10 encryption key
+ <li> Key[1] : aesimc(Round 9 encryption key)
+ <li> Key[2] : aesimc(Round 8 encryption key)
+ <li> ...
+ <li> Key[9] : aesimc(Round 1 encryption key)
+ <li> Key[10] : Initial encryption key
+</ul>
+
+<b>Note:</b> The expanded key decryption requires a decryption key only for the block
+decryption step. The tweak step in the expanded key decryption requires the same expanded
+encryption key that is used in the expanded key encryption.
+
+<b>Input and Output Buffers </b>
+
+The input and output buffers can be overlapping as long as the output buffer
+pointer is not less than the input buffer pointer. If the two pointers are the
+same, then encryption/decryption will occur in-place.
+
+<b>Data Length</b>
+
+<ul>
+ <li> The functions support any data length greater than or equal to 16 bytes.
+ <li> Data length is a 64-bit value, which makes the largest possible data length
+ 2^64 - 1 bytes.
+ <li> For data lengths from 0 to 15 bytes, the functions return without any error
+ codes, without reading or writing any data.
+ <li> The functions only support byte lengths, not bits.
+</ul>
+
+<b>Initial Tweak</b>
+
+The functions accept a 128-bit initial tweak value. The user is responsible for
+padding the initial tweak value to this length.
+
+<b>Data Alignment</b>
+
+The input and output buffers, keys, pre-expanded keys and initial tweak value
+are not required to be aligned to 16 bytes; any alignment is acceptable.
+
+ */
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** @brief XTS-AES-128 Encryption
+ * @requires AES-NI
+ */
+
+void XTS_AES_128_enc(
+ uint8_t *k2, //!< key used for tweaking, 16 bytes
+ uint8_t *k1, //!< key used for encryption of tweaked plaintext, 16 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *pt, //!< plaintext sector input data
+ uint8_t *ct //!< ciphertext sector output data
+ );
+
+/** @brief XTS-AES-128 Encryption with pre-expanded keys
+ * @requires AES-NI
+ */
+
+void XTS_AES_128_enc_expanded_key(
+ uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes
+ uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*11 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *pt, //!< plaintext sector input data
+ uint8_t *ct //!< ciphertext sector output data
+ );
+
+/** @brief XTS-AES-128 Decryption
+ * @requires AES-NI
+ */
+
+void XTS_AES_128_dec(
+ uint8_t *k2, //!< key used for tweaking, 16 bytes
+ uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 16 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *ct, //!< ciphertext sector input data
+ uint8_t *pt //!< plaintext sector output data
+ );
+
+/** @brief XTS-AES-128 Decryption with pre-expanded keys
+ * @requires AES-NI
+ */
+
+void XTS_AES_128_dec_expanded_key(
+ uint8_t *k2, //!< expanded key used for tweaking, 16*11 bytes - encryption key is used
+ uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*11 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *ct, //!< ciphertext sector input data
+ uint8_t *pt //!< plaintext sector output data
+ );
+
+/** @brief XTS-AES-256 Encryption
+ * @requires AES-NI
+ */
+
+void XTS_AES_256_enc(
+ uint8_t *k2, //!< key used for tweaking, 16*2 bytes
+ uint8_t *k1, //!< key used for encryption of tweaked plaintext, 16*2 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *pt, //!< plaintext sector input data
+ uint8_t *ct //!< ciphertext sector output data
+ );
+
+/** @brief XTS-AES-256 Encryption with pre-expanded keys
+ * @requires AES-NI
+ */
+
+void XTS_AES_256_enc_expanded_key(
+ uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes
+ uint8_t *k1, //!< expanded key used for encryption of tweaked plaintext, 16*15 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *pt, //!< plaintext sector input data
+ uint8_t *ct //!< ciphertext sector output data
+ );
+
+/** @brief XTS-AES-256 Decryption
+ * @requires AES-NI
+ */
+
+void XTS_AES_256_dec(
+ uint8_t *k2, //!< key used for tweaking, 16*2 bytes
+ uint8_t *k1, //!< key used for decryption of tweaked ciphertext, 16*2 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *ct, //!< ciphertext sector input data
+ uint8_t *pt //!< plaintext sector output data
+ );
+
+/** @brief XTS-AES-256 Decryption with pre-expanded keys
+ * @requires AES-NI
+ */
+
+void XTS_AES_256_dec_expanded_key(
+ uint8_t *k2, //!< expanded key used for tweaking, 16*15 bytes - encryption key is used
+ uint8_t *k1, //!< expanded decryption key used for decryption of tweaked ciphertext, 16*15 bytes
+ uint8_t *TW_initial, //!< initial tweak value, 16 bytes
+ uint64_t N, //!< sector size, in bytes
+ const uint8_t *ct, //!< ciphertext sector input data
+ uint8_t *pt //!< plaintext sector output data
+ );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_AES_XTS_H
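A round-trip sketch over a 512-byte sector using the non-expanded-key entry points above; note that the tweak key k2 is the first argument and the data key k1 the second. The all-zero keys and tweak are placeholder example values only.

    #include <stdint.h>
    #include <string.h>
    #include "aes_xts.h"

    int xts_128_sector_roundtrip(void)
    {
        uint8_t k1[16] = { 0 };      /* data (block cipher) key, example value */
        uint8_t k2[16] = { 0 };      /* tweak key, example value               */
        uint8_t tweak[16] = { 0 };   /* e.g. sector number, padded by caller   */
        uint8_t pt[512], ct[512], out[512];

        memset(pt, 0xa5, sizeof(pt));

        XTS_AES_128_enc(k2, k1, tweak, sizeof(pt), pt, ct);
        XTS_AES_128_dec(k2, k1, tweak, sizeof(ct), ct, out);

        return memcmp(pt, out, sizeof(pt)) ? -1 : 0;
    }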
diff --git a/src/crypto/isa-l/isa-l_crypto/include/datastruct.asm b/src/crypto/isa-l/isa-l_crypto/include/datastruct.asm
new file mode 100644
index 000000000..882e497f8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/datastruct.asm
@@ -0,0 +1,79 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; Macros for defining data structures
+
+; Usage example
+
+;START_FIELDS ; JOB_AES
+;;; name size align
+;FIELD _plaintext, 8, 8 ; pointer to plaintext
+;FIELD _ciphertext, 8, 8 ; pointer to ciphertext
+;FIELD _IV, 16, 8 ; IV
+;FIELD _keys, 8, 8 ; pointer to keys
+;FIELD _len, 4, 4 ; length in bytes
+;FIELD _status, 4, 4 ; status enumeration
+;FIELD _user_data, 8, 8 ; pointer to user data
+;UNION _union, size1, align1, \
+; size2, align2, \
+; size3, align3, \
+; ...
+;END_FIELDS
+;%assign _JOB_AES_size _FIELD_OFFSET
+;%assign _JOB_AES_align _STRUCT_ALIGN
+
+%ifndef _DATASTRUCT_ASM_
+%define _DATASTRUCT_ASM_
+
+;; START_FIELDS
+%macro START_FIELDS 0
+%assign _FIELD_OFFSET 0
+%assign _STRUCT_ALIGN 0
+%endm
+
+;; FIELD name size align
+%macro FIELD 3
+%define %%name %1
+%define %%size %2
+%define %%align %3
+
+%assign _FIELD_OFFSET (_FIELD_OFFSET + (%%align) - 1) & (~ ((%%align)-1))
+%%name equ _FIELD_OFFSET
+%assign _FIELD_OFFSET _FIELD_OFFSET + (%%size)
+%if (%%align > _STRUCT_ALIGN)
+%assign _STRUCT_ALIGN %%align
+%endif
+%endm
+
+;; END_FIELDS
+%macro END_FIELDS 0
+%assign _FIELD_OFFSET (_FIELD_OFFSET + _STRUCT_ALIGN-1) & (~ (_STRUCT_ALIGN-1))
+%endm
+
+%endif ; end ifdef _DATASTRUCT_ASM_
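For reference, the offset and alignment arithmetic performed by FIELD and END_FIELDS is the usual round-up-to-alignment computation; in C terms it is roughly the following (a descriptive analogy, not part of the library):

    #include <stddef.h>

    /* C analogy of FIELD/END_FIELDS: round the running offset up to the
     * field's alignment, place the field, and track the largest alignment
     * seen so the final struct size can be padded to it. */
    static size_t field(size_t *offset, size_t *struct_align,
                        size_t size, size_t align)
    {
        size_t pos;

        *offset = (*offset + align - 1) & ~(align - 1);   /* align up      */
        pos = *offset;                                    /* field offset  */
        *offset += size;
        if (align > *struct_align)
            *struct_align = align;
        return pos;
    }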
diff --git a/src/crypto/isa-l/isa-l_crypto/include/intrinreg.h b/src/crypto/isa-l/isa-l_crypto/include/intrinreg.h
new file mode 100644
index 000000000..8ddf3b1a5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/intrinreg.h
@@ -0,0 +1,82 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+/**
+ * @file intrinreg.h
+ * @brief Defines intrinsic types used by the new hashing API
+ *
+ */
+
+#ifndef _IA64_REGS_H_
+#define _IA64_REGS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+#ifndef __SSE__
+# define __ERR_COMPILER_FLAGS
+#endif
+
+#ifdef __ERR_COMPILER_FLAGS
+
+#ifdef __GNUC__
+# error "The compiler must be configured to optimize for speed and support at least SSE instructions."
+# error "Please specify the following gcc flags: -O3 [-mavx]"
+#endif
+
+#ifdef _MSC_VER
+# error "Please compile with ''/O2 /D__SSE__ /arch:SSE'' -OR- ''/O2 /D__AVX__ /arch:AVX''"
+#endif
+
+#endif
+
+#include <stdint.h>
+#include <immintrin.h>
+
+// Define available register types uniformly.
+/// @cond
+typedef struct{ uint8_t dummy; } intrinreg1;
+typedef struct{ uint16_t dummy; } intrinreg2;
+typedef struct{ uint32_t dummy; } intrinreg4;
+typedef struct{ uint64_t dummy; } intrinreg8;
+typedef __m128 intrinreg16;
+/// @endcond
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _IA64_REGS_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/md5_mb.h b/src/crypto/isa-l/isa-l_crypto/include/md5_mb.h
new file mode 100644
index 000000000..17a1b36ef
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/md5_mb.h
@@ -0,0 +1,372 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MD5_MB_H_
+#define _MD5_MB_H_
+
+/**
+ * @file md5_mb.h
+ * @brief Multi-buffer CTX API MD5 function prototypes and structures
+ *
+ * Interface for multi-buffer MD5 functions
+ *
+ * <b> Multi-buffer MD5 Entire or First-Update..Update-Last </b>
+ *
+ * The interface to this multi-buffer hashing code is carried out through the
+ * context-level (CTX) init, submit and flush functions and the MD5_HASH_CTX_MGR and
+ * MD5_HASH_CTX objects. Numerous MD5_HASH_CTX objects may be instantiated by the
+ * application for use with a single MD5_HASH_CTX_MGR.
+ *
+ * The CTX interface functions carry out the initialization and padding of the jobs
+ * entered by the user and add them to the multi-buffer manager. The lower level "scheduler"
+ * layer then processes the jobs in an out-of-order manner. The scheduler layer functions
+ * are internal and are not intended to be invoked directly. Jobs can be submitted
+ * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial
+ * jobs which can be started using the HASH_FIRST flag, and later resumed or finished
+ * using the HASH_UPDATE and HASH_LAST flags respectively.
+ *
+ * <b>Note:</b> The submit function does not require data buffers to be block sized.
+ *
+ * The MD5 CTX interface functions are available for 4 architectures: SSE, AVX, AVX2 and
+ * AVX512. In addition, a multibinary interface is provided, which selects the appropriate
+ * architecture-specific function at runtime.
+ *
+ * <b>Usage:</b> The application creates a MD5_HASH_CTX_MGR object and initializes it
+ * with a call to md5_ctx_mgr_init*() function, where henceforth "*" stands for the
+ * relevant suffix for each architecture; _sse, _avx, _avx2, _avx512 (or no suffix for the
+ * multibinary version). The MD5_HASH_CTX_MGR object will be used to schedule processor
+ * resources, with up to 8 MD5_HASH_CTX objects (or 16 in AVX2 case, 32 in AVX512 case)
+ * being processed at a time.
+ *
+ * Each MD5_HASH_CTX must be initialized before first use by the hash_ctx_init macro
+ * defined in multi_buffer.h. After initialization, the application may begin computing
+ * a hash by giving the MD5_HASH_CTX to a MD5_HASH_CTX_MGR using the submit functions
+ * md5_ctx_mgr_submit*() with the HASH_FIRST flag set. When the MD5_HASH_CTX is
+ * returned to the application (via this or a later call to md5_ctx_mgr_submit*() or
+ * md5_ctx_mgr_flush*()), the application can then re-submit it with another call to
+ * md5_ctx_mgr_submit*(), but without the HASH_FIRST flag set.
+ *
+ * Ideally, on the last buffer for that hash, md5_ctx_mgr_submit*() is called with
+ * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero
+ * length if necessary. When a MD5_HASH_CTX is returned after having been submitted with
+ * HASH_LAST, it will contain a valid hash. The MD5_HASH_CTX can be reused immediately
+ * by submitting with HASH_FIRST.
+ *
+ * For example, you would submit hashes with the following flags for the following numbers
+ * of buffers:
+ * <ul>
+ * <li> one buffer: HASH_FIRST | HASH_LAST (or, equivalently, HASH_ENTIRE)
+ * <li> two buffers: HASH_FIRST, HASH_LAST
+ * <li> three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * etc.
+ * </ul>
+ *
+ * The order in which MD5_HASH_CTX objects are returned is in general different from the order
+ * in which they are submitted.
+ *
+ * A few possible error conditions exist:
+ * <ul>
+ * <li> Submitting flags other than the allowed entire/first/update/last values
+ * <li> Submitting a context that is currently being managed by a MD5_HASH_CTX_MGR.
+ * <li> Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ * </ul>
+ *
+ * These error conditions are reported by returning the MD5_HASH_CTX immediately after
+ * a submit with its error member set to a non-zero error code (defined in
+ * multi_buffer.h). No changes are made to the MD5_HASH_CTX_MGR in the case of an
+ * error; no processing is done for other hashes.
+ *
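+ * <b>Example:</b> a minimal sketch of the multibinary flow with two whole
+ * buffers. The buffer pointers, lengths and the aligned allocation shown here
+ * are illustrative only; any allocation that satisfies the manager's
+ * alignment needs will do.
+ * \code
+ * MD5_HASH_CTX_MGR *mgr = NULL;
+ * MD5_HASH_CTX ctxpool[2];
+ *
+ * posix_memalign((void **)&mgr, 16, sizeof(*mgr));
+ * md5_ctx_mgr_init(mgr);
+ * hash_ctx_init(&ctxpool[0]);
+ * hash_ctx_init(&ctxpool[1]);
+ * md5_ctx_mgr_submit(mgr, &ctxpool[0], bufA, lenA, HASH_ENTIRE);
+ * md5_ctx_mgr_submit(mgr, &ctxpool[1], bufB, lenB, HASH_ENTIRE);
+ * while (md5_ctx_mgr_flush(mgr) != NULL)
+ *         ;   // drain any jobs still in flight
+ * // ctxpool[i].job.result_digest[] now holds each finished MD5 digest
+ * \endcode
+ *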
+ */
+
+#include <stdint.h>
+#include "multi_buffer.h"
+#include "types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Hash Constants and Typedefs
+#define MD5_DIGEST_NWORDS 4
+#define MD5_MAX_LANES 32
+#define MD5_MIN_LANES 8
+#define MD5_BLOCK_SIZE 64
+#define MD5_LOG2_BLOCK_SIZE 6
+#define MD5_PADLENGTHFIELD_SIZE 8
+#define MD5_INITIAL_DIGEST \
+ 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476
+
+typedef uint32_t md5_digest_array[MD5_DIGEST_NWORDS][MD5_MAX_LANES];
+typedef uint32_t MD5_WORD_T;
+
+/** @brief Scheduler layer - Holds info describing a single MD5 job for the multi-buffer manager */
+
+typedef struct {
+ uint8_t* buffer; //!< pointer to data buffer for this job
+ uint32_t len; //!< length of buffer for this job in blocks.
+ DECLARE_ALIGNED(uint32_t result_digest[MD5_DIGEST_NWORDS],64);
+ JOB_STS status; //!< output job status
+ void* user_data; //!< pointer for user's job-related data
+} MD5_JOB;
+
+/** @brief Scheduler layer - Holds arguments for submitted MD5 job */
+
+typedef struct {
+ md5_digest_array digest;
+ uint8_t* data_ptr[MD5_MAX_LANES];
+} MD5_MB_ARGS_X32;
+
+/** @brief Scheduler layer - Lane data */
+
+typedef struct {
+ MD5_JOB *job_in_lane;
+} MD5_LANE_DATA;
+
+/** @brief Scheduler layer - Holds state for multi-buffer MD5 jobs */
+
+typedef struct {
+ MD5_MB_ARGS_X32 args;
+ uint32_t lens[MD5_MAX_LANES];
+ uint64_t unused_lanes[4]; //!< each byte or nibble is index (0...31 or 15) of unused lanes.
+ MD5_LANE_DATA ldata[MD5_MAX_LANES];
+ uint32_t num_lanes_inuse;
+} MD5_MB_JOB_MGR;
+
+/** @brief Context layer - Holds state for multi-buffer MD5 jobs */
+
+typedef struct {
+ MD5_MB_JOB_MGR mgr;
+} MD5_HASH_CTX_MGR;
+
+/** @brief Context layer - Holds info describing a single MD5 job for the multi-buffer CTX manager */
+
+typedef struct {
+ MD5_JOB job; // Must be at struct offset 0.
+ HASH_CTX_STS status; //!< Context status flag
+ HASH_CTX_ERROR error; //!< Context error flag
+ uint32_t total_length; //!< Running counter of length processed for this CTX's job
+ const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job
+ uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes.
+ uint8_t partial_block_buffer[MD5_BLOCK_SIZE * 2]; //!< CTX partial blocks
+ uint32_t partial_block_buffer_length;
+ void* user_data; //!< pointer for user to keep any job-related data
+} MD5_HASH_CTX;
+
+/*******************************************************************
+ * CTX level API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the context level MD5 multi-buffer manager structure.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void md5_ctx_mgr_init_sse (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new MD5 job to the context level multi-buffer manager.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_submit_sse (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted MD5 jobs and return when complete.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_flush_sse (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the MD5 multi-buffer manager structure.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void md5_ctx_mgr_init_avx (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new MD5 job to the multi-buffer manager.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_submit_avx (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted MD5 jobs and return when complete.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_flush_avx (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the MD5 multi-buffer manager structure.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void md5_ctx_mgr_init_avx2 (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new MD5 job to the multi-buffer manager.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_submit_avx2 (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted MD5 jobs and return when complete.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_flush_avx2 (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the MD5 multi-buffer manager structure.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void md5_ctx_mgr_init_avx512 (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new MD5 job to the multi-buffer manager.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_submit_avx512 (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted MD5 jobs and return when complete.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_flush_avx512 (MD5_HASH_CTX_MGR* mgr);
+
+/******************** multibinary function prototypes **********************/
+
+/**
+ * @brief Initialize the MD5 multi-buffer manager structure.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void md5_ctx_mgr_init (MD5_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new MD5 job to the multi-buffer manager.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_submit (MD5_HASH_CTX_MGR* mgr, MD5_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted MD5 jobs and return when complete.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+MD5_HASH_CTX* md5_ctx_mgr_flush (MD5_HASH_CTX_MGR* mgr);
+
+
+/*******************************************************************
+ * Scheduler (internal) level out-of-order function prototypes
+ ******************************************************************/
+
+void md5_mb_mgr_init_sse (MD5_MB_JOB_MGR *state);
+MD5_JOB* md5_mb_mgr_submit_sse (MD5_MB_JOB_MGR *state, MD5_JOB* job);
+MD5_JOB* md5_mb_mgr_flush_sse (MD5_MB_JOB_MGR *state);
+
+#define md5_mb_mgr_init_avx md5_mb_mgr_init_sse
+MD5_JOB* md5_mb_mgr_submit_avx (MD5_MB_JOB_MGR *state, MD5_JOB* job);
+MD5_JOB* md5_mb_mgr_flush_avx (MD5_MB_JOB_MGR *state);
+
+void md5_mb_mgr_init_avx2 (MD5_MB_JOB_MGR *state);
+MD5_JOB* md5_mb_mgr_submit_avx2 (MD5_MB_JOB_MGR *state, MD5_JOB* job);
+MD5_JOB* md5_mb_mgr_flush_avx2 (MD5_MB_JOB_MGR *state);
+
+void md5_mb_mgr_init_avx512 (MD5_MB_JOB_MGR *state);
+MD5_JOB* md5_mb_mgr_submit_avx512 (MD5_MB_JOB_MGR *state, MD5_JOB* job);
+MD5_JOB* md5_mb_mgr_flush_avx512 (MD5_MB_JOB_MGR *state);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _MD5_MB_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/memcpy.asm b/src/crypto/isa-l/isa-l_crypto/include/memcpy.asm
new file mode 100644
index 000000000..1342dc1c3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/memcpy.asm
@@ -0,0 +1,346 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifndef __MEMCPY_ASM__
+%define __MEMCPY_ASM__
+
+%include "reg_sizes.asm"
+
+
+; This file defines a series of macros to copy small to medium amounts
+; of data from memory to memory, where the size is variable but limited.
+;
+; The macros are all called as:
+; memcpy DST, SRC, SIZE, TMP0, TMP1, XTMP0, XTMP1, XTMP2, XTMP3
+; with the parameters defined as:
+; DST : register: pointer to dst (not modified)
+; SRC : register: pointer to src (not modified)
+; SIZE : register: length in bytes (not modified)
+; TMP0 : 64-bit temp GPR (clobbered)
+; TMP1 : 64-bit temp GPR (clobbered)
+; XTMP0 : temp XMM (clobbered)
+; XTMP1 : temp XMM (clobbered)
+; XTMP2 : temp XMM (clobbered)
+; XTMP3 : temp XMM (clobbered)
+;
+; The name indicates the options. The name is of the form:
+; memcpy_<VEC>_<SZ><ZERO><RET>
+; where:
+; <VEC> is either "sse" or "avx" or "avx2"
+; <SZ> is either "64" or "128" and defines largest value of SIZE
+; <ZERO> is blank or "_1". If "_1" then the min SIZE is 1 (otherwise 0)
+; <RET> is blank or "_ret". If blank, the code falls through. If "ret"
+; it does a "ret" at the end
+;
+; For the avx2 versions, the temp XMM registers need to be YMM registers
+; If the SZ is 64, then only two YMM temps are needed, i.e. it is called as:
+; memcpy_avx2_64 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1
+; memcpy_avx2_128 DST, SRC, SIZE, TMP0, TMP1, YTMP0, YTMP1, YTMP2, YTMP3
+;
+; For example:
+; memcpy_sse_64 : SSE, 0 <= size < 64, falls through
+; memcpy_avx_64_1 : AVX1, 1 <= size < 64, falls through
+; memcpy_sse_128_ret : SSE, 0 <= size < 128, ends with ret
+;	memcpy_avx_128_1_ret	: AVX1, 1 <= size < 128, ends with ret
+;
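+; A minimal invocation sketch (the register choices below are illustrative
+; only; any scratch GPRs and XMM registers matching the parameter types work):
+;	memcpy_sse_128_1_ret	rdi, rsi, rdx, rax, r10, xmm0, xmm1, xmm2, xmm3
+; copies 1 <= rdx < 128 bytes from [rsi] to [rdi] and ends with a "ret".
+;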
+
+%macro memcpy_sse_64 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 0
+%endm
+
+%macro memcpy_sse_64_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 0
+%endm
+
+%macro memcpy_sse_128 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 0
+%endm
+
+%macro memcpy_sse_128_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 0
+%endm
+
+%macro memcpy_sse_64_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 0
+%endm
+
+%macro memcpy_sse_64_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 0
+%endm
+
+%macro memcpy_sse_128_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 0
+%endm
+
+%macro memcpy_sse_128_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 0
+%endm
+
+
+%macro memcpy_sse_16 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 0
+%endm
+
+%macro memcpy_sse_16_1 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 0
+%endm
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%macro memcpy_avx_64 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 0, 1
+%endm
+
+%macro memcpy_avx_64_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 0, 1
+%endm
+
+%macro memcpy_avx_128 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 0, 1
+%endm
+
+%macro memcpy_avx_128_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 0, 1
+%endm
+
+%macro memcpy_avx_64_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 64, 1, 1
+%endm
+
+%macro memcpy_avx_64_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 64, 1, 1
+%endm
+
+%macro memcpy_avx_128_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 0, 128, 1, 1
+%endm
+
+%macro memcpy_avx_128_1_ret 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,%8,%9, 1, 128, 1, 1
+%endm
+
+
+%macro memcpy_avx_16 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 0, 16, 0, 1
+%endm
+
+%macro memcpy_avx_16_1 5
+ __memcpy_int %1,%2,%3,%4,%5,,,,, 1, 16, 0, 1
+%endm
+
+ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%macro memcpy_avx2_64 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 0, 2
+%endm
+
+%macro memcpy_avx2_64_1 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 0, 2
+%endm
+
+%macro memcpy_avx2_128 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 0, 2
+%endm
+
+%macro memcpy_avx2_128_1 9
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 0, 2
+%endm
+
+%macro memcpy_avx2_64_ret 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 0, 64, 1, 2
+%endm
+
+%macro memcpy_avx2_64_1_ret 7
+ __memcpy_int %1,%2,%3,%4,%5,%6,%7,--,--, 1, 64, 1, 2
+%endm
+
+%macro memcpy_avx2_128_ret 9
+	__memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 0, 128, 1, 2
+%endm
+
+%macro memcpy_avx2_128_1_ret 9
+	__memcpy_int %1,%2,%3,%4,%5,%6,%7, %8, %9, 1, 128, 1, 2
+%endm
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+%macro __memcpy_int 13
+%define %%DST %1 ; register: pointer to dst (not modified)
+%define %%SRC %2 ; register: pointer to src (not modified)
+%define %%SIZE %3 ; register: length in bytes (not modified)
+%define %%TMP0 %4 ; 64-bit temp GPR (clobbered)
+%define %%TMP1 %5 ; 64-bit temp GPR (clobbered)
+%define %%XTMP0 %6 ; temp XMM (clobbered)
+%define %%XTMP1 %7 ; temp XMM (clobbered)
+%define %%XTMP2 %8 ; temp XMM (clobbered)
+%define %%XTMP3 %9 ; temp XMM (clobbered)
+%define %%NOT0 %10 ; if not 0, then assume size cannot be zero
+%define %%MAXSIZE %11 ; 128, 64, etc
+%define %%USERET %12 ; if not 0, use "ret" at end
+%define %%USEAVX %13 ; 0 = SSE, 1 = AVX1, 2 = AVX2
+
+%if (%%USERET != 0)
+ %define %%DONE ret
+%else
+ %define %%DONE jmp %%end
+%endif
+
+%if (%%USEAVX != 0)
+ %define %%MOVDQU vmovdqu
+%else
+ %define %%MOVDQU movdqu
+%endif
+
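+; Copy strategy: the blocks below test SIZE from the largest chunk width
+; (MAXSIZE/2) down to a single byte. The first width whose bit is set in SIZE
+; copies one chunk from the head of the buffer and one chunk ending at its
+; last byte ([SRC + SIZE - chunk]), then finishes; since SIZE is less than
+; twice that width, the two (possibly overlapping) copies cover the whole
+; range without a byte loop.
+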
+%if (%%MAXSIZE >= 128)
+ test %%SIZE, 64
+ jz %%lt64
+ %if (%%USEAVX >= 2)
+ %%MOVDQU %%XTMP0, [%%SRC + 0*32]
+ %%MOVDQU %%XTMP1, [%%SRC + 1*32]
+ %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*32]
+ %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*32]
+
+ %%MOVDQU [%%DST + 0*32], %%XTMP0
+ %%MOVDQU [%%DST + 1*32], %%XTMP1
+ %%MOVDQU [%%DST + %%SIZE - 2*32], %%XTMP2
+ %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP3
+ %else
+ %%MOVDQU %%XTMP0, [%%SRC + 0*16]
+ %%MOVDQU %%XTMP1, [%%SRC + 1*16]
+ %%MOVDQU %%XTMP2, [%%SRC + 2*16]
+ %%MOVDQU %%XTMP3, [%%SRC + 3*16]
+ %%MOVDQU [%%DST + 0*16], %%XTMP0
+ %%MOVDQU [%%DST + 1*16], %%XTMP1
+ %%MOVDQU [%%DST + 2*16], %%XTMP2
+ %%MOVDQU [%%DST + 3*16], %%XTMP3
+
+ %%MOVDQU %%XTMP0, [%%SRC + %%SIZE - 4*16]
+ %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 3*16]
+ %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
+ %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + %%SIZE - 4*16], %%XTMP0
+ %%MOVDQU [%%DST + %%SIZE - 3*16], %%XTMP1
+ %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
+ %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
+ %endif
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 64)
+%%lt64:
+ test %%SIZE, 32
+ jz %%lt32
+ %if (%%USEAVX >= 2)
+ %%MOVDQU %%XTMP0, [%%SRC + 0*32]
+ %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*32]
+ %%MOVDQU [%%DST + 0*32], %%XTMP0
+ %%MOVDQU [%%DST + %%SIZE - 1*32], %%XTMP1
+ %else
+ %%MOVDQU %%XTMP0, [%%SRC + 0*16]
+ %%MOVDQU %%XTMP1, [%%SRC + 1*16]
+ %%MOVDQU %%XTMP2, [%%SRC + %%SIZE - 2*16]
+ %%MOVDQU %%XTMP3, [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + 0*16], %%XTMP0
+ %%MOVDQU [%%DST + 1*16], %%XTMP1
+ %%MOVDQU [%%DST + %%SIZE - 2*16], %%XTMP2
+ %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP3
+ %endif
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 32)
+%%lt32:
+ test %%SIZE, 16
+ jz %%lt16
+ %if (%%USEAVX >= 2)
+ %%MOVDQU XWORD(%%XTMP0), [%%SRC + 0*16]
+ %%MOVDQU XWORD(%%XTMP1), [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + 0*16], XWORD(%%XTMP0)
+ %%MOVDQU [%%DST + %%SIZE - 1*16], XWORD(%%XTMP1)
+ %else
+ %%MOVDQU %%XTMP0, [%%SRC + 0*16]
+ %%MOVDQU %%XTMP1, [%%SRC + %%SIZE - 1*16]
+ %%MOVDQU [%%DST + 0*16], %%XTMP0
+ %%MOVDQU [%%DST + %%SIZE - 1*16], %%XTMP1
+ %endif
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 16)
+%%lt16:
+ test %%SIZE, 8
+ jz %%lt8
+ mov %%TMP0, [%%SRC]
+ mov %%TMP1, [%%SRC + %%SIZE - 8]
+ mov [%%DST], %%TMP0
+ mov [%%DST + %%SIZE - 8], %%TMP1
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 8)
+%%lt8:
+ test %%SIZE, 4
+ jz %%lt4
+ mov DWORD(%%TMP0), [%%SRC]
+ mov DWORD(%%TMP1), [%%SRC + %%SIZE - 4]
+ mov [%%DST], DWORD(%%TMP0)
+ mov [%%DST + %%SIZE - 4], DWORD(%%TMP1)
+ %%DONE
+%endif
+
+%if (%%MAXSIZE >= 4)
+%%lt4:
+ test %%SIZE, 2
+ jz %%lt2
+ movzx DWORD(%%TMP0), word [%%SRC]
+ movzx DWORD(%%TMP1), byte [%%SRC + %%SIZE - 1]
+ mov [%%DST], WORD(%%TMP0)
+ mov [%%DST + %%SIZE - 1], BYTE(%%TMP1)
+ %%DONE
+%endif
+
+%%lt2:
+%if (%%NOT0 == 0)
+ test %%SIZE, 1
+ jz %%end
+%endif
+ movzx DWORD(%%TMP0), byte [%%SRC]
+ mov [%%DST], BYTE(%%TMP0)
+%%end:
+%if (%%USERET != 0)
+ ret
+%endif
+%endm
+
+%endif ; ifndef __MEMCPY_ASM__
diff --git a/src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h b/src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h
new file mode 100644
index 000000000..b15da3cb1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/memcpy_inline.h
@@ -0,0 +1,363 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+/**
+ * @file memcpy_inline.h
+ * @brief Defines intrinsic memcpy functions used by the new hashing API
+ *
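+ *  A minimal usage sketch (buffer names and sizes are illustrative; the
+ *  aliases used are the SSE-dispatching defaults defined in this header):
+ *  \code
+ *  uint8_t dst[64], src[64];
+ *  memcpy_fixedlen(dst, src, sizeof(src)); // size typically known at compile time
+ *  memclr_varlen(dst, 37);                 // size known only at run time
+ *  \endcode
+ *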
+ */
+
+#ifndef _MEMCPY_H_
+#define _MEMCPY_H_
+
+#include "intrinreg.h"
+#include <assert.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define memcpy_varlen memcpy_sse_varlen
+#define memcpy_fixedlen memcpy_sse_fixedlen
+
+#define memclr_varlen memclr_sse_varlen
+#define memclr_fixedlen memclr_sse_fixedlen
+
+static inline void memcpy_lte32_sse_fixedlen(void* dst, const void* src, size_t nbytes);
+static inline void memcpy_gte16_sse_fixedlen(void* dst, const void* src, size_t nbytes);
+static inline void memcpy_sse_fixedlen (void* dst, const void* src, size_t nbytes);
+
+static inline void memcpy_lte32_sse_varlen (void* dst, const void* src, size_t nbytes);
+static inline void memcpy_gte16_sse_varlen (void* dst, const void* src, size_t nbytes);
+static inline void memcpy_sse_varlen (void* dst, const void* src, size_t nbytes);
+
+
+static inline void memclr_lte32_sse_fixedlen(void* dst, size_t nbytes);
+static inline void memclr_gte16_sse_fixedlen(void* dst, size_t nbytes);
+static inline void memclr_sse_fixedlen (void* dst, size_t nbytes);
+
+static inline void memclr_lte32_sse_varlen (void* dst, size_t nbytes);
+static inline void memclr_gte16_sse_varlen (void* dst, size_t nbytes);
+static inline void memclr_sse_varlen (void* dst, size_t nbytes);
+
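+/* The two helper macros below copy or clear a span of N..2N bytes using at
+ * most two register-wide accesses: one starting at the first byte and one
+ * ending at the last byte of the span. For the copy case both loads are
+ * issued before both stores, so any overlap in the middle is harmless and
+ * no per-byte loop is needed. */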
+#define MEMCPY_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, src, nbytes) \
+ do { \
+ intrinreg##N head; \
+ intrinreg##N tail; \
+ assert(N <= nbytes && nbytes <= 2*N); \
+ if(N == 1 || (fixedwidth && nbytes==N) ) { \
+ head = load_intrinreg##N(src); \
+ store_intrinreg##N(dst, head); \
+ } \
+ else { \
+ head = load_intrinreg##N(src); \
+ tail = load_intrinreg##N((const void*)((const char*)src + (nbytes - N))); \
+ store_intrinreg##N(dst, head); \
+ store_intrinreg##N((void*)((char*)dst + (nbytes - N)), tail); \
+ } \
+ } while(0)
+
+#define MEMCLR_BETWEEN_N_AND_2N_BYTES(N, fixedwidth, dst, nbytes) \
+ do { \
+ const intrinreg##N zero = {0}; \
+ assert(N <= nbytes && nbytes <= 2*N); \
+ if(N == 1 || (fixedwidth && nbytes==N) ) { \
+ store_intrinreg##N(dst, zero); \
+ } \
+ else { \
+ store_intrinreg##N(dst, zero); \
+ store_intrinreg##N((void*)((char*)dst + (nbytes - N)), zero); \
+ } \
+ } while(0)
+
+// Define load/store functions uniformly.
+
+#define load_intrinreg16(src) _mm_loadu_ps((const float*) src)
+#define store_intrinreg16(dst,val) _mm_storeu_ps((float*) dst, val)
+
+static inline intrinreg8 load_intrinreg8(const void *src)
+{
+ return *(intrinreg8 *) src;
+}
+
+static inline void store_intrinreg8(void *dst, intrinreg8 val)
+{
+ *(intrinreg8 *) dst = val;
+}
+
+static inline intrinreg4 load_intrinreg4(const void *src)
+{
+ return *(intrinreg4 *) src;
+}
+
+static inline void store_intrinreg4(void *dst, intrinreg4 val)
+{
+ *(intrinreg4 *) dst = val;
+}
+
+static inline intrinreg2 load_intrinreg2(const void *src)
+{
+ return *(intrinreg2 *) src;
+}
+
+static inline void store_intrinreg2(void *dst, intrinreg2 val)
+{
+ *(intrinreg2 *) dst = val;
+}
+
+static inline intrinreg1 load_intrinreg1(const void *src)
+{
+ return *(intrinreg1 *) src;
+}
+
+static inline void store_intrinreg1(void *dst, intrinreg1 val)
+{
+ *(intrinreg1 *) dst = val;
+}
+
+static inline void memcpy_gte16_sse_fixedlen(void *dst, const void *src, size_t nbytes)
+{
+ size_t i;
+ size_t j;
+ intrinreg16 pool[4];
+ size_t remaining_moves;
+ size_t tail_offset;
+ int do_tail;
+ assert(nbytes >= 16);
+
+ for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4) {
+ for (j = 0; j < 4; j++)
+ pool[j] =
+ load_intrinreg16((const void *)((const char *)src + i + 16 * j));
+ for (j = 0; j < 4; j++)
+ store_intrinreg16((void *)((char *)dst + i + 16 * j), pool[j]);
+ }
+
+ remaining_moves = (nbytes - i) / 16;
+ tail_offset = nbytes - 16;
+ do_tail = (tail_offset & (16 - 1));
+
+ for (j = 0; j < remaining_moves; j++)
+ pool[j] = load_intrinreg16((const void *)((const char *)src + i + 16 * j));
+
+ if (do_tail)
+ pool[j] = load_intrinreg16((const void *)((const char *)src + tail_offset));
+
+ for (j = 0; j < remaining_moves; j++)
+ store_intrinreg16((void *)((char *)dst + i + 16 * j), pool[j]);
+
+ if (do_tail)
+ store_intrinreg16((void *)((char *)dst + tail_offset), pool[j]);
+}
+
+static inline void memclr_gte16_sse_fixedlen(void *dst, size_t nbytes)
+{
+ size_t i;
+ size_t j;
+ const intrinreg16 zero = { 0 };
+ size_t remaining_moves;
+ size_t tail_offset;
+ int do_tail;
+ assert(nbytes >= 16);
+
+ for (i = 0; i + 16 * 4 <= nbytes; i += 16 * 4)
+ for (j = 0; j < 4; j++)
+ store_intrinreg16((void *)((char *)dst + i + 16 * j), zero);
+
+ remaining_moves = (nbytes - i) / 16;
+ tail_offset = nbytes - 16;
+ do_tail = (tail_offset & (16 - 1));
+
+ for (j = 0; j < remaining_moves; j++)
+ store_intrinreg16((void *)((char *)dst + i + 16 * j), zero);
+
+ if (do_tail)
+ store_intrinreg16((void *)((char *)dst + tail_offset), zero);
+}
+
+static inline void memcpy_lte32_sse_fixedlen(void *dst, const void *src, size_t nbytes)
+{
+ assert(nbytes <= 32);
+ if (nbytes >= 16)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 1, dst, src, nbytes);
+ else if (nbytes >= 8)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 1, dst, src, nbytes);
+ else if (nbytes >= 4)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 1, dst, src, nbytes);
+ else if (nbytes >= 2)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 1, dst, src, nbytes);
+ else if (nbytes >= 1)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 1, dst, src, nbytes);
+}
+
+static inline void memclr_lte32_sse_fixedlen(void *dst, size_t nbytes)
+{
+ assert(nbytes <= 32);
+ if (nbytes >= 16)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 1, dst, nbytes);
+ else if (nbytes >= 8)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 1, dst, nbytes);
+ else if (nbytes >= 4)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 1, dst, nbytes);
+ else if (nbytes >= 2)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 1, dst, nbytes);
+ else if (nbytes >= 1)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 1, dst, nbytes);
+}
+
+static inline void memcpy_lte32_sse_varlen(void *dst, const void *src, size_t nbytes)
+{
+ assert(nbytes <= 32);
+ if (nbytes >= 16)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(16, 0, dst, src, nbytes);
+ else if (nbytes >= 8)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(8, 0, dst, src, nbytes);
+ else if (nbytes >= 4)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(4, 0, dst, src, nbytes);
+ else if (nbytes >= 2)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(2, 0, dst, src, nbytes);
+ else if (nbytes >= 1)
+ MEMCPY_BETWEEN_N_AND_2N_BYTES(1, 0, dst, src, nbytes);
+}
+
+static inline void memclr_lte32_sse_varlen(void *dst, size_t nbytes)
+{
+ assert(nbytes <= 32);
+ if (nbytes >= 16)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(16, 0, dst, nbytes);
+ else if (nbytes >= 8)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(8, 0, dst, nbytes);
+ else if (nbytes >= 4)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(4, 0, dst, nbytes);
+ else if (nbytes >= 2)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(2, 0, dst, nbytes);
+ else if (nbytes >= 1)
+ MEMCLR_BETWEEN_N_AND_2N_BYTES(1, 0, dst, nbytes);
+}
+
+static inline void memcpy_gte16_sse_varlen(void *dst, const void *src, size_t nbytes)
+{
+ size_t i = 0;
+ intrinreg16 tail;
+
+ assert(nbytes >= 16);
+
+ while (i + 128 <= nbytes) {
+ memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
+ (const void *)((const char *)src + i), 128);
+ i += 128;
+ }
+ if (i + 64 <= nbytes) {
+ memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
+ (const void *)((const char *)src + i), 64);
+ i += 64;
+ }
+ if (i + 32 <= nbytes) {
+ memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
+ (const void *)((const char *)src + i), 32);
+ i += 32;
+ }
+ if (i + 16 <= nbytes) {
+ memcpy_gte16_sse_fixedlen((void *)((char *)dst + i),
+ (const void *)((const char *)src + i), 16);
+ i += 16;
+ }
+
+ i = nbytes - 16;
+ tail = load_intrinreg16((const void *)((const char *)src + i));
+ store_intrinreg16((void *)((char *)dst + i), tail);
+}
+
+static inline void memclr_gte16_sse_varlen(void *dst, size_t nbytes)
+{
+ size_t i = 0;
+ const intrinreg16 zero = { 0 };
+
+ assert(nbytes >= 16);
+
+ while (i + 128 <= nbytes) {
+ memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 128);
+ i += 128;
+ }
+ if (i + 64 <= nbytes) {
+ memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 64);
+ i += 64;
+ }
+ if (i + 32 <= nbytes) {
+ memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 32);
+ i += 32;
+ }
+ if (i + 16 <= nbytes) {
+ memclr_gte16_sse_fixedlen((void *)((char *)dst + i), 16);
+ i += 16;
+ }
+
+ i = nbytes - 16;
+ store_intrinreg16((void *)((char *)dst + i), zero);
+}
+
+static inline void memcpy_sse_fixedlen(void *dst, const void *src, size_t nbytes)
+{
+ if (nbytes >= 16)
+ memcpy_gte16_sse_fixedlen(dst, src, nbytes);
+ else
+ memcpy_lte32_sse_fixedlen(dst, src, nbytes);
+}
+
+static inline void memclr_sse_fixedlen(void *dst, size_t nbytes)
+{
+ if (nbytes >= 16)
+ memclr_gte16_sse_fixedlen(dst, nbytes);
+ else
+ memclr_lte32_sse_fixedlen(dst, nbytes);
+}
+
+static inline void memcpy_sse_varlen(void *dst, const void *src, size_t nbytes)
+{
+ if (nbytes >= 16)
+ memcpy_gte16_sse_varlen(dst, src, nbytes);
+ else
+ memcpy_lte32_sse_varlen(dst, src, nbytes);
+}
+
+static inline void memclr_sse_varlen(void *dst, size_t nbytes)
+{
+ if (nbytes >= 16)
+ memclr_gte16_sse_varlen(dst, nbytes);
+ else
+ memclr_lte32_sse_varlen(dst, nbytes);
+}
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _MEMCPY_H_
diff --git a/src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h
new file mode 100644
index 000000000..ea9bb9ac4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1.h
@@ -0,0 +1,315 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA1_H_
+#define _MH_SHA1_H_
+
+/**
+ * @file mh_sha1.h
+ * @brief mh_sha1 function prototypes and structures
+ *
+ * Interface for mh_sha1 functions
+ *
+ * <b> mh_sha1 Init-Update..Update-Finalize </b>
+ *
+ * This file defines the interface to optimized functions used in mh_sha1.
+ * The definition of multi-hash SHA1 (mh_sha1, for short) is: pad the buffer
+ * in SHA1 style until the total length is a multiple of 4*16*16 bytes
+ * (word-width * parallel-segments * block-size); hash the buffer in
+ * parallel, generating digests of 4*16*5 bytes (word-width *
+ * parallel-segments * digest-size); treat the set of digests as another
+ * data buffer, and generate a final SHA1 digest for it.
+ *
+ *
+ * Example
+ * \code
+ * uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS];
+ * struct mh_sha1_ctx *ctx;
+ *
+ * ctx = malloc(sizeof(struct mh_sha1_ctx));
+ * mh_sha1_init(ctx);
+ * mh_sha1_update(ctx, buff, block_len);
+ * mh_sha1_finalize(ctx, mh_sha1_digest);
+ * \endcode
+ */
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+// External Interface Definition
+#define HASH_SEGS 16
+#define SHA1_BLOCK_SIZE 64
+#define MH_SHA1_BLOCK_SIZE (HASH_SEGS * SHA1_BLOCK_SIZE)
+#define SHA1_DIGEST_WORDS 5
+#define AVX512_ALIGNED 64
+
+/** @brief Holds info describing a single mh_sha1
+ *
+ * It is better to allocate this data structure on the heap to avoid stack overflow.
+ *
+*/
+struct mh_sha1_ctx {
+ uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]; //!< the digest of multi-hash SHA1
+
+ uint64_t total_length;
+ //!< Parameters for update feature, describe the lengths of input buffers in bytes
+ uint8_t partial_block_buffer [MH_SHA1_BLOCK_SIZE * 2];
+ //!< Padding the tail of input data for SHA1
+ uint8_t mh_sha1_interim_digests[sizeof(uint32_t) * SHA1_DIGEST_WORDS * HASH_SEGS];
+ //!< Storing the SHA1 interim digests of all 16 segments. Each time, it will be copied to stack for 64-byte alignment purpose.
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE + AVX512_ALIGNED];
+ //!< Re-structure sha1 block data from different segments to fit big endian. Use AVX512_ALIGNED for 64-byte alignment purpose.
+};
+
+/**
+ * @enum mh_sha1_ctx_error
+ * @brief CTX error flags
+ */
+enum mh_sha1_ctx_error{
+	MH_SHA1_CTX_ERROR_NONE = 0, //!< MH_SHA1_CTX_ERROR_NONE
+	MH_SHA1_CTX_ERROR_NULL = -1, //!< MH_SHA1_CTX_ERROR_NULL
+};
+
+
+/*******************************************************************
+ * mh_sha1 API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the mh_sha1_ctx structure.
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_init (struct mh_sha1_ctx* ctx);
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_update (struct mh_sha1_ctx * ctx, const void* buffer, uint32_t len);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digest in mh_sha1_digest which must have enough space
+ * for the outputs.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_finalize (struct mh_sha1_ctx* ctx, void* mh_sha1_digest);
+
+/*******************************************************************
+ * multi-types of mh_sha1 internal API
+ *
+ * XXXX The multi-binary version
+ * XXXX_base	The C code version used to illustrate the algorithm
+ * XXXX_sse	The version that uses an ASM function optimized for SSE
+ * XXXX_avx	The version that uses an ASM function optimized for AVX
+ * XXXX_avx2	The version that uses an ASM function optimized for AVX2
+ * XXXX_avx512	The version that uses an ASM function optimized for AVX512
+ *
+ ******************************************************************/
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * Base update() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_update_base (struct mh_sha1_ctx* ctx, const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_update_sse (struct mh_sha1_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_update_avx (struct mh_sha1_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_update_avx2 (struct mh_sha1_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Multi-hash sha1 update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_update_avx512 (struct mh_sha1_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digests in mh_sha1_digest,
+ * which must have enough space for the outputs.
+ * Base Finalize() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_finalize_base (struct mh_sha1_ctx* ctx,
+ void* mh_sha1_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digest in mh_sha1_digest which must have enough space
+ * for the outputs.
+ *
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_finalize_sse (struct mh_sha1_ctx* ctx,
+ void* mh_sha1_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digest in mh_sha1_digest which must have enough space
+ * for the outputs.
+ *
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_finalize_avx (struct mh_sha1_ctx* ctx,
+ void* mh_sha1_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digest in mh_sha1_digest which must have enough space
+ * for the outputs.
+ *
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_finalize_avx2 (struct mh_sha1_ctx* ctx,
+ void* mh_sha1_digest);
+
+/**
+ * @brief Finalize the message digests for multi-hash sha1.
+ *
+ * Place the message digest in mh_sha1_digest which must have enough space
+ * for the outputs.
+ *
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha1 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_finalize_avx512 (struct mh_sha1_ctx* ctx,
+ void* mh_sha1_digest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h
new file mode 100644
index 000000000..0c84650e2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/mh_sha1_murmur3_x64_128.h
@@ -0,0 +1,327 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA1_MURMUR3_X64_128_H_
+#define _MH_SHA1_MURMUR3_X64_128_H_
+
+/**
+ * @file mh_sha1_murmur3_x64_128.h
+ * @brief mh_sha1_murmur3_x64_128 function prototypes and structures
+ *
+ * Interface for mh_sha1_murmur3_x64_128 functions
+ *
+ * <b> mh_sha1_murmur3_x64_128 Init-Update..Update-Finalize </b>
+ *
+ * This file defines the interface to optimized functions used in mh_sha1 and
+ * mh_sha1_murmur3_x64_128. The definition of multi-hash SHA1 (mh_sha1,
+ * for short) is: pad the buffer in SHA1 style until the total length is a
+ * multiple of 4*16*16 bytes (word-width * parallel-segments * block-size);
+ * hash the buffer in parallel, generating digests of 4*16*5 bytes
+ * (word-width * parallel-segments * digest-size); treat the set of digests
+ * as another data buffer, and generate a final SHA1 digest for it.
+ * mh_sha1_murmur3_x64_128 is a stitching function that computes a
+ * murmur3_x64_128 digest while generating the mh_sha1 digest.
+ *
+ *
+ * Example
+ * \code
+ * uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS];
+ * uint32_t murmur_digest[MURMUR3_x64_128_DIGEST_WORDS];
+ * struct mh_sha1_murmur3_x64_128_ctx *ctx;
+ *
+ * ctx = malloc(sizeof(struct mh_sha1_murmur3_x64_128_ctx));
+ * mh_sha1_murmur3_x64_128_init(ctx, 0);
+ * mh_sha1_murmur3_x64_128_update(ctx, buff, block_len);
+ * mh_sha1_murmur3_x64_128_finalize(ctx, mh_sha1_digest,
+ * murmur_digest);
+ * \endcode
+ */
+
+#include <stdint.h>
+#include "mh_sha1.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+// External Interface Definition
+// Add murmur3_x64_128 definition
+#define MUR_BLOCK_SIZE (2 * sizeof(uint64_t))
+#define MURMUR3_x64_128_DIGEST_WORDS 4
+
+/** @brief Holds info describing a single mh_sha1_murmur3_x64_128
+ *
+ * It is better to allocate this data structure on the heap to avoid stack overflow.
+ *
+*/
+struct mh_sha1_murmur3_x64_128_ctx {
+ uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]; //!< the digest of multi-hash SHA1
+ uint32_t murmur3_x64_128_digest[MURMUR3_x64_128_DIGEST_WORDS]; //!< the digest of murmur3_x64_128
+
+ uint64_t total_length;
+ //!< Parameters for update feature, describe the lengths of input buffers in bytes
+ uint8_t partial_block_buffer [MH_SHA1_BLOCK_SIZE * 2];
+ //!< Padding the tail of input data for SHA1
+ uint8_t mh_sha1_interim_digests[sizeof(uint32_t) * SHA1_DIGEST_WORDS * HASH_SEGS];
+ //!< Storing the SHA1 interim digests of all 16 segments. Each time, it will be copied to stack for 64-byte alignment purpose.
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE + AVX512_ALIGNED];
+ //!< Re-structure sha1 block data from different segments to fit big endian. Use AVX512_ALIGNED for 64-byte alignment purpose.
+};
+
+/**
+ * @enum mh_sha1_murmur3_ctx_error
+ * @brief CTX error flags
+ */
+enum mh_sha1_murmur3_ctx_error{
+ MH_SHA1_MURMUR3_CTX_ERROR_NONE = 0, //!< MH_SHA1_MURMUR3_CTX_ERROR_NONE
+	MH_SHA1_MURMUR3_CTX_ERROR_NULL = -1, //!< MH_SHA1_MURMUR3_CTX_ERROR_NULL
+};
+
+
+/*******************************************************************
+ * mh_sha1_murmur3_x64_128 API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the mh_sha1_murmur3_x64_128_ctx structure.
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param murmur_seed Seed as an initial digest of murmur3
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_murmur3_x64_128_init (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ uint64_t murmur_seed);
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_murmur3_x64_128_update (struct mh_sha1_murmur3_x64_128_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ */
+int mh_sha1_murmur3_x64_128_finalize (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+/*******************************************************************
+ * multi-types of mh_sha1_murmur3_x64_128 internal API
+ *
+ * XXXX The multi-binary version
+ * XXXX_base	The C code version used to illustrate the algorithm
+ * XXXX_sse	The version that uses an ASM function optimized for SSE
+ * XXXX_avx	The version that uses an ASM function optimized for AVX
+ * XXXX_avx2	The version that uses an ASM function optimized for AVX2
+ * XXXX_avx512	The version that uses an ASM function optimized for AVX512
+ *
+ ******************************************************************/
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * Base update() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_update_base (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_update_sse (struct mh_sha1_murmur3_x64_128_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_update_avx (struct mh_sha1_murmur3_x64_128_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_update_avx2 (struct mh_sha1_murmur3_x64_128_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Combined multi-hash and murmur hash update.
+ *
+ * Can be called repeatedly to update hashes with new input data.
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_update_avx512 (struct mh_sha1_murmur3_x64_128_ctx * ctx,
+ const void* buffer, uint32_t len);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ * Base Finalize() function that does not require SIMD support.
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_finalize_base (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires SSE
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_finalize_sse (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires AVX
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_finalize_avx (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires AVX2
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_finalize_avx2 (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+/**
+ * @brief Finalize the message digests for combined multi-hash and murmur.
+ *
+ * Place the message digests in mh_sha1_digest and murmur3_x64_128_digest,
+ * which must have enough space for the outputs.
+ *
+ * @requires AVX512
+ *
+ * @param ctx Structure holding mh_sha1_murmur3_x64_128 info
+ * @param mh_sha1_digest The digest of mh_sha1
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns int Return 0 if the function runs without errors
+ *
+ */
+int mh_sha1_murmur3_x64_128_finalize_avx512 (struct mh_sha1_murmur3_x64_128_ctx* ctx,
+ void* mh_sha1_digest, void* murmur3_x64_128_digest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
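The update/finalize prototypes above are easiest to see in context, so here is a minimal, hypothetical usage sketch in C. The init entry point (and its murmur seed parameter) and the suffix-free multibinary update/finalize names are assumptions taken from the rest of this header rather than from the prototypes shown in this hunk, which only lists the _sse/_avx/_avx2/_avx512 and _base variants. Digest sizes follow from SHA1 (160 bits) and murmur3_x64_128 (128 bits).

    /* Hypothetical sketch: one-shot combined mh_sha1 + murmur3_x64_128 hash.
     * The init signature and the suffix-free multibinary names are assumed. */
    #include <stdint.h>
    #include "mh_sha1_murmur3_x64_128.h"   /* assumed header name for these prototypes */

    int hash_one_buffer(const void *data, uint32_t len, uint64_t murmur_seed)
    {
            struct mh_sha1_murmur3_x64_128_ctx ctx;
            uint32_t sha1_digest[5];       /* SHA1: 160 bits = 5 x 32-bit words */
            uint32_t murmur_digest[4];     /* murmur3_x64_128: 128 bits         */

            if (mh_sha1_murmur3_x64_128_init(&ctx, murmur_seed) != 0)    /* assumed prototype  */
                    return -1;
            if (mh_sha1_murmur3_x64_128_update(&ctx, data, len) != 0)    /* multibinary update */
                    return -1;
            return mh_sha1_murmur3_x64_128_finalize(&ctx, sha1_digest, murmur_digest);
    }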
diff --git a/src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h b/src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h
new file mode 100644
index 000000000..d9b713575
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/multi_buffer.h
@@ -0,0 +1,127 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MULTI_BUFFER_H_
+#define _MULTI_BUFFER_H_
+
+/**
+ * @file multi_buffer.h
+ * @brief Multi-buffer common fields
+ *
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __FreeBSD__
+#include <sys/types.h>
+#include <sys/endian.h>
+# define _byteswap_uint64(x) bswap64(x)
+# define _byteswap_ulong(x) bswap32(x)
+#elif defined (__APPLE__)
+#include <libkern/OSByteOrder.h>
+# define _byteswap_uint64(x) OSSwapInt64(x)
+# define _byteswap_ulong(x) OSSwapInt32(x)
+#elif defined (__GNUC__) && !defined (__MINGW32__)
+# include <byteswap.h>
+# define _byteswap_uint64(x) bswap_64(x)
+# define _byteswap_ulong(x) bswap_32(x)
+#endif
+
+/**
+ * @enum JOB_STS
+ * @brief Job return codes
+ */
+
+typedef enum {STS_UNKNOWN = 0, //!< STS_UNKNOWN
+ STS_BEING_PROCESSED = 1,//!< STS_BEING_PROCESSED
+ STS_COMPLETED = 2, //!< STS_COMPLETED
+ STS_INTERNAL_ERROR, //!< STS_INTERNAL_ERROR
+ STS_ERROR //!< STS_ERROR
+} JOB_STS;
+
+#define HASH_MB_NO_FLAGS 0
+#define HASH_MB_FIRST 1
+#define HASH_MB_LAST 2
+
+/* Common flags for the new API only
+ * */
+
+/**
+ * @enum HASH_CTX_FLAG
+ * @brief CTX job type
+ */
+typedef enum {
+ HASH_UPDATE = 0x00, //!< HASH_UPDATE
+ HASH_FIRST = 0x01, //!< HASH_FIRST
+ HASH_LAST = 0x02, //!< HASH_LAST
+ HASH_ENTIRE = 0x03, //!< HASH_ENTIRE
+} HASH_CTX_FLAG;
+
+/**
+ * @enum HASH_CTX_STS
+ * @brief CTX status flags
+ */
+typedef enum {
+ HASH_CTX_STS_IDLE = 0x00, //!< HASH_CTX_STS_IDLE
+ HASH_CTX_STS_PROCESSING = 0x01, //!< HASH_CTX_STS_PROCESSING
+ HASH_CTX_STS_LAST = 0x02, //!< HASH_CTX_STS_LAST
+ HASH_CTX_STS_COMPLETE = 0x04, //!< HASH_CTX_STS_COMPLETE
+} HASH_CTX_STS;
+
+/**
+ * @enum HASH_CTX_ERROR
+ * @brief CTX error flags
+ */
+typedef enum {
+ HASH_CTX_ERROR_NONE = 0, //!< HASH_CTX_ERROR_NONE
+ HASH_CTX_ERROR_INVALID_FLAGS = -1, //!< HASH_CTX_ERROR_INVALID_FLAGS
+ HASH_CTX_ERROR_ALREADY_PROCESSING = -2, //!< HASH_CTX_ERROR_ALREADY_PROCESSING
+ HASH_CTX_ERROR_ALREADY_COMPLETED = -3, //!< HASH_CTX_ERROR_ALREADY_COMPLETED
+} HASH_CTX_ERROR;
+
+
+#define hash_ctx_user_data(ctx) ((ctx)->user_data)
+#define hash_ctx_digest(ctx) ((ctx)->job.result_digest)
+#define hash_ctx_processing(ctx) ((ctx)->status & HASH_CTX_STS_PROCESSING)
+#define hash_ctx_complete(ctx) ((ctx)->status == HASH_CTX_STS_COMPLETE)
+#define hash_ctx_status(ctx) ((ctx)->status)
+#define hash_ctx_error(ctx) ((ctx)->error)
+#define hash_ctx_init(ctx) \
+ do { \
+ (ctx)->error = HASH_CTX_ERROR_NONE; \
+ (ctx)->status = HASH_CTX_STS_COMPLETE; \
+ } while(0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _MULTI_BUFFER_H_
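The helper macros above are shared by all of the hash context types in this patch. As a small hedged sketch of how they fit together, the function below assumes the SHA1_HASH_CTX type from sha1_mb.h (declared later in this patch); multi_buffer.h itself is type-agnostic and only supplies the flags, status codes and accessors. hash_ctx_init() must be run on a context before its first submit; the helper then checks whether a returned context holds a valid digest.

    /* Sketch only; assumes sha1_mb.h for a concrete *_HASH_CTX type. */
    #include <stdint.h>
    #include "multi_buffer.h"
    #include "sha1_mb.h"

    static const uint32_t *digest_if_ready(SHA1_HASH_CTX *ctx)
    {
            if (hash_ctx_processing(ctx))          /* still owned by a manager */
                    return NULL;
            if (!hash_ctx_complete(ctx) || hash_ctx_error(ctx) != HASH_CTX_ERROR_NONE)
                    return NULL;
            return hash_ctx_digest(ctx);           /* valid after HASH_LAST/HASH_ENTIRE */
    }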
diff --git a/src/crypto/isa-l/isa-l_crypto/include/multibinary.asm b/src/crypto/isa-l/isa-l_crypto/include/multibinary.asm
new file mode 100644
index 000000000..45f87a9b1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/multibinary.asm
@@ -0,0 +1,271 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _MULTIBINARY_ASM_
+%define _MULTIBINARY_ASM_
+
+%ifidn __OUTPUT_FORMAT__, elf32
+ %define mbin_def_ptr dd
+ %define mbin_ptr_sz dword
+ %define mbin_rdi edi
+ %define mbin_rsi esi
+ %define mbin_rax eax
+ %define mbin_rbx ebx
+ %define mbin_rcx ecx
+ %define mbin_rdx edx
+%else
+ %define mbin_def_ptr dq
+ %define mbin_ptr_sz qword
+ %define mbin_rdi rdi
+ %define mbin_rsi rsi
+ %define mbin_rax rax
+ %define mbin_rbx rbx
+ %define mbin_rcx rcx
+ %define mbin_rdx rdx
+%endif
+
+;;;;
+; multibinary macro:
+; creates the visible entry point that uses the HW optimized call pointer
+; creates the init routine for the HW optimized call pointer
+;;;;
+%macro mbin_interface 1
+ ;;;;
+ ; *_dispatched is defaulted to *_mbinit and replaced on first call.
+ ; Therefore, *_dispatch_init is only executed on first call.
+ ;;;;
+ section .data
+ %1_dispatched:
+ mbin_def_ptr %1_mbinit
+
+ section .text
+ global %1:function
+ %1_mbinit:
+	;;; only called the first time to set up hardware match
+ call %1_dispatch_init
+ ;;; falls thru to execute the hw optimized code
+ %1:
+ jmp mbin_ptr_sz [%1_dispatched]
+%endmacro
+
+;;;;;
+; mbin_dispatch_init parameters
+; Use this function when SSE/00/01 is a minimum requirement
+; 1-> function name
+; 2-> SSE/00/01 optimized function used as base
+; 3-> AVX or AVX/02 opt func
+; 4-> AVX2 or AVX/04 opt func
+;;;;;
+%macro mbin_dispatch_init 4
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01
+
+ mov eax, 1
+ cpuid
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func
+ jne _%1_init_done ; AVX is not available so end
+ mov mbin_rsi, mbin_rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _%1_init_done
+ lea mbin_rsi, [%2 WRT_OPT]
+
+ _%1_init_done:
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init2 parameters
+; Cases where only base functions are available
+; 1-> function name
+; 2-> base function
+;;;;;
+%macro mbin_dispatch_init2 2
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init5 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_1 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+;;;;;
+%macro mbin_dispatch_init5 5
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ ; Test for SSE4.1
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ lea mbin_rbx, [%3 WRT_OPT] ; SSE opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen2) opt func
+ jne _%1_init_done ; AVX is not available so end
+ mov mbin_rsi, mbin_rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ lea mbin_rbx, [%5 WRT_OPT] ; AVX (gen4) opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _%1_init_done
+ lea mbin_rsi, [%3 WRT_OPT]
+
+ _%1_init_done:
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init6 parameters
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_1 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+;;;;;
+%macro mbin_dispatch_init6 6
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ je _%1_init_done ; Use base function if no SSE4_1
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+	mov	edi, eax		; save xgetbv.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_init_done
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+ lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_ECX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_ECX_AVX512_G1
+ lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+ cmove mbin_rsi, mbin_rbx
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+%endif ; ifndef _MULTIBINARY_ASM_
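For readers more comfortable in C, the mbin_interface / mbin_dispatch_init pair above amounts to the self-dispatching function pointer pattern sketched below. The names and kernels are made up for illustration; note that the real macros additionally confirm via XGETBV that the OS saves the wider register state before an AVX path is chosen, a check a production C version should also perform.

    /* Illustrative C analogy of mbin_interface/mbin_dispatch_init (hypothetical
     * names).  A function pointer starts out aimed at an init stub; the stub
     * probes the CPU once, repoints the pointer, and then calls the choice. */
    #include <stddef.h>
    #include <stdint.h>

    static uint32_t sum_base(const uint8_t *p, size_t n)   /* stand-in "base" kernel */
    {
            uint32_t s = 0;
            for (size_t i = 0; i < n; i++)
                    s += p[i];
            return s;
    }

    static uint32_t sum_avx2(const uint8_t *p, size_t n)   /* stand-in "optimized" kernel */
    {
            return sum_base(p, n);                          /* placeholder body */
    }

    static uint32_t sum_mbinit(const uint8_t *p, size_t n);
    static uint32_t (*sum_dispatched)(const uint8_t *, size_t) = sum_mbinit;

    static uint32_t sum_mbinit(const uint8_t *p, size_t n)
    {
            /* __builtin_cpu_supports (GCC/Clang) tests the CPUID feature bit at
             * run time; the asm macros also check XGETBV for OS-enabled AVX state. */
            sum_dispatched = __builtin_cpu_supports("avx2") ? sum_avx2 : sum_base;
            return sum_dispatched(p, n);
    }

    uint32_t sum(const uint8_t *p, size_t n)                /* the visible entry point */
    {
            return sum_dispatched(p, n);
    }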
diff --git a/src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm b/src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm
new file mode 100644
index 000000000..64064a1a2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/reg_sizes.asm
@@ -0,0 +1,149 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifndef _REG_SIZES_ASM_
+%define _REG_SIZES_ASM_
+
+%ifdef __NASM_VER__
+%ifidn __OUTPUT_FORMAT__, win64
+%error nasm not supported in windows
+%else
+%define endproc_frame
+%endif
+%endif
+
+%define EFLAGS_HAS_CPUID (1<<21)
+%define FLAG_CPUID1_ECX_CLMUL (1<<1)
+%define FLAG_CPUID1_EDX_SSE2 (1<<26)
+%define FLAG_CPUID1_ECX_SSE3 (1)
+%define FLAG_CPUID1_ECX_SSE4_1 (1<<19)
+%define FLAG_CPUID1_ECX_SSE4_2 (1<<20)
+%define FLAG_CPUID1_ECX_POPCNT (1<<23)
+%define FLAG_CPUID1_ECX_AESNI (1<<25)
+%define FLAG_CPUID1_ECX_OSXSAVE (1<<27)
+%define FLAG_CPUID1_ECX_AVX (1<<28)
+%define FLAG_CPUID1_EBX_AVX2 (1<<5)
+
+%define FLAG_CPUID7_EBX_AVX2 (1<<5)
+%define FLAG_CPUID7_EBX_AVX512F (1<<16)
+%define FLAG_CPUID7_EBX_AVX512DQ (1<<17)
+%define FLAG_CPUID7_EBX_AVX512IFMA (1<<21)
+%define FLAG_CPUID7_EBX_AVX512PF (1<<26)
+%define FLAG_CPUID7_EBX_AVX512ER (1<<27)
+%define FLAG_CPUID7_EBX_AVX512CD (1<<28)
+%define FLAG_CPUID7_EBX_AVX512BW (1<<30)
+%define FLAG_CPUID7_EBX_AVX512VL (1<<31)
+%define FLAG_CPUID7_ECX_AVX512VBMI (1<<1)
+
+%define FLAGS_CPUID7_ECX_AVX512_G1 (FLAG_CPUID7_EBX_AVX512F | FLAG_CPUID7_EBX_AVX512VL | FLAG_CPUID7_EBX_AVX512BW | FLAG_CPUID7_EBX_AVX512CD | FLAG_CPUID7_EBX_AVX512DQ)
+
+%define FLAG_XGETBV_EAX_XMM (1<<1)
+%define FLAG_XGETBV_EAX_YMM (1<<2)
+%define FLAG_XGETBV_EAX_XMM_YMM 0x6
+%define FLAG_XGETBV_EAX_ZMM_OPM 0xe0
+
+%define FLAG_CPUID1_EAX_AVOTON 0x000406d0
+%define FLAG_CPUID1_EAX_STEP_MASK 0xfffffff0
+
+; define d and w variants for registers
+
+%define raxd eax
+%define raxw ax
+%define raxb al
+
+%define rbxd ebx
+%define rbxw bx
+%define rbxb bl
+
+%define rcxd ecx
+%define rcxw cx
+%define rcxb cl
+
+%define rdxd edx
+%define rdxw dx
+%define rdxb dl
+
+%define rsid esi
+%define rsiw si
+%define rsib sil
+
+%define rdid edi
+%define rdiw di
+%define rdib dil
+
+%define rbpd ebp
+%define rbpw bp
+%define rbpb bpl
+
+%define ymm0x xmm0
+%define ymm1x xmm1
+%define ymm2x xmm2
+%define ymm3x xmm3
+%define ymm4x xmm4
+%define ymm5x xmm5
+%define ymm6x xmm6
+%define ymm7x xmm7
+%define ymm8x xmm8
+%define ymm9x xmm9
+%define ymm10x xmm10
+%define ymm11x xmm11
+%define ymm12x xmm12
+%define ymm13x xmm13
+%define ymm14x xmm14
+%define ymm15x xmm15
+
+%define DWORD(reg) reg %+ d
+%define WORD(reg) reg %+ w
+%define BYTE(reg) reg %+ b
+
+%define XWORD(reg) reg %+ x
+
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+section .text
+%endif
+%ifidn __OUTPUT_FORMAT__, macho64
+%define elf64 macho64
+%endif
+
+%macro slversion 4
+ section .text
+ global %1_slver_%2%3%4
+ global %1_slver
+ %1_slver:
+ %1_slver_%2%3%4:
+ dw 0x%4
+ db 0x%3, 0x%2
+%endmacro
+
+%endif ; ifndef _REG_SIZES_ASM_
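The feature-flag constants above map directly onto CPUID leaf 1 and leaf 7 results. As a cross-check, here is a small C sketch of the same AVX2 and AVX512 "group 1" tests, assuming a GCC/Clang toolchain that provides <cpuid.h>; a complete dispatcher would also have to confirm the XGETBV bits (FLAG_XGETBV_EAX_XMM_YMM / FLAG_XGETBV_EAX_ZMM_OPM above), which is omitted here for brevity.

    /* Sketch of the leaf-7 checks used by the dispatch macros, in C. */
    #include <cpuid.h>
    #include <stdbool.h>

    #define CPUID7_EBX_AVX2       (1u << 5)    /* FLAG_CPUID7_EBX_AVX2     */
    #define CPUID7_EBX_AVX512F    (1u << 16)   /* FLAG_CPUID7_EBX_AVX512F  */
    #define CPUID7_EBX_AVX512DQ   (1u << 17)   /* FLAG_CPUID7_EBX_AVX512DQ */
    #define CPUID7_EBX_AVX512CD   (1u << 28)   /* FLAG_CPUID7_EBX_AVX512CD */
    #define CPUID7_EBX_AVX512BW   (1u << 30)   /* FLAG_CPUID7_EBX_AVX512BW */
    #define CPUID7_EBX_AVX512VL   (1u << 31)   /* FLAG_CPUID7_EBX_AVX512VL */
    #define CPUID7_EBX_AVX512_G1  (CPUID7_EBX_AVX512F | CPUID7_EBX_AVX512DQ | \
                                   CPUID7_EBX_AVX512CD | CPUID7_EBX_AVX512BW | \
                                   CPUID7_EBX_AVX512VL)

    static bool cpu_has_avx2(void)
    {
            unsigned int a, b, c, d;
            return __get_cpuid_count(7, 0, &a, &b, &c, &d) && (b & CPUID7_EBX_AVX2);
    }

    static bool cpu_has_avx512_g1(void)
    {
            unsigned int a, b, c, d;
            return __get_cpuid_count(7, 0, &a, &b, &c, &d) &&
                   (b & CPUID7_EBX_AVX512_G1) == CPUID7_EBX_AVX512_G1;
    }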
diff --git a/src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h b/src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h
new file mode 100644
index 000000000..7ddeb45f3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/sha1_mb.h
@@ -0,0 +1,377 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _SHA1_MB_H_
+#define _SHA1_MB_H_
+
+/**
+ * @file sha1_mb.h
+ * @brief Multi-buffer CTX API SHA1 function prototypes and structures
+ *
+ * Interface for multi-buffer SHA1 functions
+ *
+ * <b> Multi-buffer SHA1 Entire or First-Update..Update-Last </b>
+ *
+ * The interface to this multi-buffer hashing code is carried out through the
+ * context-level (CTX) init, submit and flush functions and the SHA1_HASH_CTX_MGR and
+ * SHA1_HASH_CTX objects. Numerous SHA1_HASH_CTX objects may be instantiated by the
+ * application for use with a single SHA1_HASH_CTX_MGR.
+ *
+ * The CTX interface functions carry out the initialization and padding of the jobs
+ * entered by the user and add them to the multi-buffer manager. The lower level "scheduler"
+ * layer then processes the jobs in an out-of-order manner. The scheduler layer functions
+ * are internal and are not intended to be invoked directly. Jobs can be submitted
+ * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial
+ * jobs which can be started using the HASH_FIRST flag, and later resumed or finished
+ * using the HASH_UPDATE and HASH_LAST flags respectively.
+ *
+ * <b>Note:</b> The submit function does not require data buffers to be block sized.
+ *
+ * The SHA1 CTX interface functions are available for 4 architectures: SSE, AVX, AVX2 and
+ * AVX512. In addition, a multibinary interface is provided, which selects the appropriate
+ * architecture-specific function at runtime.
+ *
+ * <b>Usage:</b> The application creates a SHA1_HASH_CTX_MGR object and initializes it
+ * with a call to the sha1_ctx_mgr_init*() function, where henceforth "*" stands for the
+ * relevant suffix for each architecture: _sse, _avx, _avx2, _avx512 (or no suffix for the
+ * multibinary version). The SHA1_HASH_CTX_MGR object will be used to schedule processor
+ * resources, with up to 4 SHA1_HASH_CTX objects (or 8 in the AVX2 case, 16 in the AVX512)
+ * being processed at a time.
+ *
+ * Each SHA1_HASH_CTX must be initialized before first use by the hash_ctx_init macro
+ * defined in multi_buffer.h. After initialization, the application may begin computing
+ * a hash by giving the SHA1_HASH_CTX to a SHA1_HASH_CTX_MGR using the submit functions
+ * sha1_ctx_mgr_submit*() with the HASH_FIRST flag set. When the SHA1_HASH_CTX is
+ * returned to the application (via this or a later call to sha1_ctx_mgr_submit*() or
+ * sha1_ctx_mgr_flush*()), the application can then re-submit it with another call to
+ * sha1_ctx_mgr_submit*(), but without the HASH_FIRST flag set.
+ *
+ * Ideally, on the last buffer for that hash, sha1_ctx_mgr_submit_sse is called with
+ * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero
+ * length if necessary. When a SHA1_HASH_CTX is returned after having been submitted with
+ * HASH_LAST, it will contain a valid hash. The SHA1_HASH_CTX can be reused immediately
+ * by submitting with HASH_FIRST.
+ *
+ * For example, you would submit hashes with the following flags for the following numbers
+ * of buffers:
+ * <ul>
+ * <li> one buffer: HASH_FIRST | HASH_LAST (or, equivalently, HASH_ENTIRE)
+ * <li> two buffers: HASH_FIRST, HASH_LAST
+ * <li> three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * etc.
+ * </ul>
+ *
+ * The order in which SHA1_CTX objects are returned is in general different from the order
+ * in which they are submitted.
+ *
+ * A few possible error conditions exist:
+ * <ul>
+ * <li> Submitting flags other than the allowed entire/first/update/last values
+ * <li> Submitting a context that is currently being managed by a SHA1_HASH_CTX_MGR.
+ * <li> Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ * </ul>
+ *
+ * These error conditions are reported by returning the SHA1_HASH_CTX immediately after
+ * a submit with its error member set to a non-zero error code (defined in
+ * multi_buffer.h). No changes are made to the SHA1_HASH_CTX_MGR in the case of an
+ * error; no processing is done for other hashes.
+ *
+ */
+
+#include <stdint.h>
+#include "multi_buffer.h"
+#include "types.h"
+
+#ifndef _MSC_VER
+#include <stdbool.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Hash Constants and Typedefs
+#define SHA1_DIGEST_NWORDS 5
+#define SHA1_MAX_LANES 16
+#define SHA1_X8_LANES 8
+#define SHA1_MIN_LANES 4
+#define SHA1_BLOCK_SIZE 64
+#define SHA1_LOG2_BLOCK_SIZE 6
+#define SHA1_PADLENGTHFIELD_SIZE 8
+#define SHA1_INITIAL_DIGEST \
+ 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0
+
+typedef uint32_t sha1_digest_array[SHA1_DIGEST_NWORDS][SHA1_MAX_LANES];
+typedef uint32_t SHA1_WORD_T;
+
+/** @brief Scheduler layer - Holds info describing a single SHA1 job for the multi-buffer manager */
+
+typedef struct {
+ uint8_t* buffer; //!< pointer to data buffer for this job
+ uint32_t len; //!< length of buffer for this job in blocks.
+ DECLARE_ALIGNED(uint32_t result_digest[SHA1_DIGEST_NWORDS],64);
+ JOB_STS status; //!< output job status
+ void* user_data; //!< pointer for user's job-related data
+} SHA1_JOB;
+
+/** @brief Scheduler layer - Holds arguments for submitted SHA1 job */
+
+typedef struct {
+ sha1_digest_array digest;
+ uint8_t* data_ptr[SHA1_MAX_LANES];
+} SHA1_MB_ARGS_X16;
+
+/** @brief Scheduler layer - Lane data */
+
+typedef struct {
+ SHA1_JOB *job_in_lane;
+} SHA1_LANE_DATA;
+
+/** @brief Scheduler layer - Holds state for multi-buffer SHA1 jobs */
+
+typedef struct {
+ SHA1_MB_ARGS_X16 args;
+ uint32_t lens[SHA1_MAX_LANES];
+ uint64_t unused_lanes; //!< each nibble is index (0...3 or 0...7 or 0...15) of unused lanes, nibble 4 or 8 is set to F as a flag
+ SHA1_LANE_DATA ldata[SHA1_MAX_LANES];
+ uint32_t num_lanes_inuse;
+} SHA1_MB_JOB_MGR;
+
+/** @brief Context layer - Holds state for multi-buffer SHA1 jobs */
+
+typedef struct {
+ SHA1_MB_JOB_MGR mgr;
+} SHA1_HASH_CTX_MGR;
+
+/** @brief Context layer - Holds info describing a single SHA1 job for the multi-buffer CTX manager */
+
+typedef struct {
+ SHA1_JOB job; // Must be at struct offset 0.
+ HASH_CTX_STS status; //!< Context status flag
+ HASH_CTX_ERROR error; //!< Context error flag
+ uint32_t total_length; //!< Running counter of length processed for this CTX's job
+ const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job
+ uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes.
+ uint8_t partial_block_buffer[SHA1_BLOCK_SIZE * 2]; //!< CTX partial blocks
+ uint32_t partial_block_buffer_length;
+ void* user_data; //!< pointer for user to keep any job-related data
+} SHA1_HASH_CTX;
+
+/*******************************************************************
+ * Context level API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the context level SHA1 multi-buffer manager structure.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_sse (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the context level multi-buffer manager.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_sse (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_sse (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA1 multi-buffer manager structure.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_avx (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the multi-buffer manager.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_avx (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_avx (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA1 multi-buffer manager structure.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_avx2 (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the multi-buffer manager.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_avx2 (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_avx2 (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA1 multi-buffer manager structure.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init_avx512 (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the multi-buffer manager.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit_avx512 (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush_avx512 (SHA1_HASH_CTX_MGR* mgr);
+
+/******************** multibinary function prototypes **********************/
+
+/**
+ * @brief Initialize the SHA1 multi-buffer manager structure.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha1_ctx_mgr_init (SHA1_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA1 job to the multi-buffer manager.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_submit (SHA1_HASH_CTX_MGR* mgr, SHA1_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA1 jobs and return when complete.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA1_HASH_CTX* sha1_ctx_mgr_flush (SHA1_HASH_CTX_MGR* mgr);
+
+
+/*******************************************************************
+ * Scheduler (internal) level out-of-order function prototypes
+ ******************************************************************/
+
+void sha1_mb_mgr_init_sse (SHA1_MB_JOB_MGR *state);
+SHA1_JOB* sha1_mb_mgr_submit_sse (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_sse (SHA1_MB_JOB_MGR *state);
+
+#define sha1_mb_mgr_init_avx sha1_mb_mgr_init_sse
+SHA1_JOB* sha1_mb_mgr_submit_avx (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_avx (SHA1_MB_JOB_MGR *state);
+
+void sha1_mb_mgr_init_avx2 (SHA1_MB_JOB_MGR *state);
+SHA1_JOB* sha1_mb_mgr_submit_avx2 (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_avx2 (SHA1_MB_JOB_MGR *state);
+
+void sha1_mb_mgr_init_avx512 (SHA1_MB_JOB_MGR *state);
+SHA1_JOB* sha1_mb_mgr_submit_avx512 (SHA1_MB_JOB_MGR *state, SHA1_JOB* job);
+SHA1_JOB* sha1_mb_mgr_flush_avx512 (SHA1_MB_JOB_MGR *state);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _SHA1_MB_H_
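The usage description at the top of this header is dense, so here is a minimal single-buffer sketch against the multibinary entry points. It assumes the manager and context can simply live on the stack (the structure definitions above declare their own alignment) and abbreviates error handling.

    /* Minimal sketch: hash one whole buffer through the SHA1 multi-buffer CTX API. */
    #include <stdio.h>
    #include "sha1_mb.h"

    int main(void)
    {
            SHA1_HASH_CTX_MGR mgr;
            SHA1_HASH_CTX ctx;
            SHA1_HASH_CTX *done;
            const char msg[] = "hello multi-buffer";

            sha1_ctx_mgr_init(&mgr);
            hash_ctx_init(&ctx);                    /* from multi_buffer.h */

            /* One whole buffer: HASH_ENTIRE == HASH_FIRST | HASH_LAST. */
            done = sha1_ctx_mgr_submit(&mgr, &ctx, msg, sizeof(msg) - 1, HASH_ENTIRE);

            /* Drain the manager until the context comes back completed. */
            while (done == NULL)
                    done = sha1_ctx_mgr_flush(&mgr);

            if (hash_ctx_error(done) != HASH_CTX_ERROR_NONE)
                    return 1;
            for (int i = 0; i < SHA1_DIGEST_NWORDS; i++)
                    printf("%08x", hash_ctx_digest(done)[i]);
            printf("\n");
            return 0;
    }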
diff --git a/src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h b/src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h
new file mode 100644
index 000000000..cd48508d8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/sha256_mb.h
@@ -0,0 +1,376 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _SHA256_MB_H_
+#define _SHA256_MB_H_
+
+/**
+ * @file sha256_mb.h
+ * @brief Multi-buffer CTX API SHA256 function prototypes and structures
+ *
+ * Interface for multi-buffer SHA256 functions
+ *
+ * <b> Multi-buffer SHA256 Entire or First-Update..Update-Last </b>
+ *
+ * The interface to this multi-buffer hashing code is carried out through the
+ * context-level (CTX) init, submit and flush functions and the SHA256_HASH_CTX_MGR and
+ * SHA256_HASH_CTX objects. Numerous SHA256_HASH_CTX objects may be instantiated by the
+ * application for use with a single SHA256_HASH_CTX_MGR.
+ *
+ * The CTX interface functions carry out the initialization and padding of the jobs
+ * entered by the user and add them to the multi-buffer manager. The lower level "scheduler"
+ * layer then processes the jobs in an out-of-order manner. The scheduler layer functions
+ * are internal and are not intended to be invoked directly. Jobs can be submitted
+ * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial
+ * jobs which can be started using the HASH_FIRST flag, and later resumed or finished
+ * using the HASH_UPDATE and HASH_LAST flags respectively.
+ *
+ * <b>Note:</b> The submit function does not require data buffers to be block sized.
+ *
+ * The SHA256 CTX interface functions are available for 4 architectures: SSE, AVX, AVX2 and
+ * AVX512. In addition, a multibinary interface is provided, which selects the appropriate
+ * architecture-specific function at runtime.
+ *
+ * <b>Usage:</b> The application creates a SHA256_HASH_CTX_MGR object and initializes it
+ * with a call to the sha256_ctx_mgr_init*() function, where henceforth "*" stands for the
+ * relevant suffix for each architecture: _sse, _avx, _avx2, _avx512 (or no suffix for the
+ * multibinary version). The SHA256_HASH_CTX_MGR object will be used to schedule processor
+ * resources, with up to 4 SHA256_HASH_CTX objects (or 8 in the AVX2 case, 16 in the AVX512)
+ * being processed at a time.
+ *
+ * Each SHA256_HASH_CTX must be initialized before first use by the hash_ctx_init macro
+ * defined in multi_buffer.h. After initialization, the application may begin computing
+ * a hash by giving the SHA256_HASH_CTX to a SHA256_HASH_CTX_MGR using the submit functions
+ * sha256_ctx_mgr_submit*() with the HASH_FIRST flag set. When the SHA256_HASH_CTX is
+ * returned to the application (via this or a later call to sha256_ctx_mgr_submit*() or
+ * sha256_ctx_mgr_flush*()), the application can then re-submit it with another call to
+ * sha256_ctx_mgr_submit*(), but without the HASH_FIRST flag set.
+ *
+ * Ideally, on the last buffer for that hash, sha256_ctx_mgr_submit_sse is called with
+ * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero
+ * length if necessary. When a SHA256_HASH_CTX is returned after having been submitted with
+ * HASH_LAST, it will contain a valid hash. The SHA256_HASH_CTX can be reused immediately
+ * by submitting with HASH_FIRST.
+ *
+ * For example, you would submit hashes with the following flags for the following numbers
+ * of buffers:
+ * <ul>
+ * <li> one buffer: HASH_FIRST | HASH_LAST (or, equivalently, HASH_ENTIRE)
+ * <li> two buffers: HASH_FIRST, HASH_LAST
+ * <li> three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * etc.
+ * </ul>
+ *
+ * The order in which SHA256_CTX objects are returned is in general different from the order
+ * in which they are submitted.
+ *
+ * A few possible error conditions exist:
+ * <ul>
+ * <li> Submitting flags other than the allowed entire/first/update/last values
+ * <li> Submitting a context that is currently being managed by a SHA256_HASH_CTX_MGR.
+ * <li> Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ * </ul>
+ *
+ * These error conditions are reported by returning the SHA256_HASH_CTX immediately after
+ * a submit with its error member set to a non-zero error code (defined in
+ * multi_buffer.h). No changes are made to the SHA256_HASH_CTX_MGR in the case of an
+ * error; no processing is done for other hashes.
+ *
+ */
+
+#include <stdint.h>
+#include "multi_buffer.h"
+#include "types.h"
+
+#ifndef _MSC_VER
+#include <stdbool.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Hash Constants and Typedefs
+#define SHA256_DIGEST_NWORDS 8
+#define SHA256_MAX_LANES 16
+#define SHA256_X8_LANES 8
+#define SHA256_MIN_LANES 4
+#define SHA256_BLOCK_SIZE 64
+#define SHA256_LOG2_BLOCK_SIZE 6
+#define SHA256_PADLENGTHFIELD_SIZE 8
+#define SHA256_INITIAL_DIGEST \
+ 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, \
+ 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+
+typedef uint32_t sha256_digest_array[SHA256_DIGEST_NWORDS][SHA256_MAX_LANES];
+typedef uint32_t SHA256_WORD_T;
+
+/** @brief Scheduler layer - Holds info describing a single SHA256 job for the multi-buffer manager */
+
+typedef struct {
+ uint8_t* buffer; //!< pointer to data buffer for this job
+ uint64_t len; //!< length of buffer for this job in blocks.
+ DECLARE_ALIGNED(uint32_t result_digest[SHA256_DIGEST_NWORDS], 64);
+ JOB_STS status; //!< output job status
+ void* user_data; //!< pointer for user's job-related data
+} SHA256_JOB;
+
+/** @brief Scheduler layer - Holds arguments for submitted SHA256 job */
+
+typedef struct {
+ sha256_digest_array digest;
+ uint8_t* data_ptr[SHA256_MAX_LANES];
+} SHA256_MB_ARGS_X16;
+
+/** @brief Scheduler layer - Lane data */
+
+typedef struct {
+ SHA256_JOB *job_in_lane;
+} SHA256_LANE_DATA;
+
+/** @brief Scheduler layer - Holds state for multi-buffer SHA256 jobs */
+
+typedef struct {
+ SHA256_MB_ARGS_X16 args;
+ uint32_t lens[SHA256_MAX_LANES];
+ uint64_t unused_lanes; //!< each nibble is index (0...3 or 0...7) of unused lanes, nibble 4 or 8 is set to F as a flag
+ SHA256_LANE_DATA ldata[SHA256_MAX_LANES];
+ uint32_t num_lanes_inuse;
+} SHA256_MB_JOB_MGR;
+
+/** @brief Context layer - Holds state for multi-buffer SHA256 jobs */
+
+typedef struct {
+ SHA256_MB_JOB_MGR mgr;
+} SHA256_HASH_CTX_MGR;
+
+/** @brief Context layer - Holds info describing a single SHA256 job for the multi-buffer CTX manager */
+
+typedef struct {
+ SHA256_JOB job; // Must be at struct offset 0.
+ HASH_CTX_STS status; //!< Context status flag
+ HASH_CTX_ERROR error; //!< Context error flag
+ uint32_t total_length; //!< Running counter of length processed for this CTX's job
+ const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job
+ uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes.
+ uint8_t partial_block_buffer[SHA256_BLOCK_SIZE * 2]; //!< CTX partial blocks
+ uint32_t partial_block_buffer_length;
+ void* user_data; //!< pointer for user to keep any job-related data
+} SHA256_HASH_CTX;
+
+/*******************************************************************
+ * CTX level API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the context level SHA256 multi-buffer manager structure.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_sse (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the context level multi-buffer manager.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_sse (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_sse (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA256 multi-buffer manager structure.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_avx (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the multi-buffer manager.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_avx (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_avx (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA256 multi-buffer manager structure.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_avx2 (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the multi-buffer manager.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_avx2 (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_avx2 (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA256 multi-buffer manager structure.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init_avx512 (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the multi-buffer manager.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit_avx512 (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush_avx512 (SHA256_HASH_CTX_MGR* mgr);
+/******************** multibinary function prototypes **********************/
+
+/**
+ * @brief Initialize the SHA256 multi-buffer manager structure.
+ * @requires SSE4.1 or AVX or AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha256_ctx_mgr_init (SHA256_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA256 job to the multi-buffer manager.
+ * @requires SSE4.1 or AVX or AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_submit (SHA256_HASH_CTX_MGR* mgr, SHA256_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA256 jobs and return when complete.
+ * @requires SSE4.1 or AVX or AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA256_HASH_CTX* sha256_ctx_mgr_flush (SHA256_HASH_CTX_MGR* mgr);
+
+
+/*******************************************************************
+ * Scheduler (internal) level out-of-order function prototypes
+ ******************************************************************/
+
+void sha256_mb_mgr_init_sse(SHA256_MB_JOB_MGR *state);
+SHA256_JOB* sha256_mb_mgr_submit_sse(SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_sse(SHA256_MB_JOB_MGR *state);
+
+#define sha256_mb_mgr_init_avx sha256_mb_mgr_init_sse
+SHA256_JOB* sha256_mb_mgr_submit_avx (SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_avx (SHA256_MB_JOB_MGR *state);
+
+void sha256_mb_mgr_init_avx2 (SHA256_MB_JOB_MGR *state);
+SHA256_JOB* sha256_mb_mgr_submit_avx2 (SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_avx2 (SHA256_MB_JOB_MGR *state);
+
+void sha256_mb_mgr_init_avx512 (SHA256_MB_JOB_MGR *state);
+SHA256_JOB* sha256_mb_mgr_submit_avx512 (SHA256_MB_JOB_MGR *state, SHA256_JOB* job);
+SHA256_JOB* sha256_mb_mgr_flush_avx512 (SHA256_MB_JOB_MGR *state);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _SHA256_MB_H_
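Since completed contexts can come back in a different order than they were submitted (as noted above), a caller juggling several buffers typically tags each context via user_data and drains the manager with flush. A hedged sketch of that pattern follows, again assuming stack-allocated structures and the multibinary entry points.

    /* Sketch: several SHA256 jobs in flight, identified via user_data. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include "sha256_mb.h"

    #define NBUF 4

    int main(void)
    {
            SHA256_HASH_CTX_MGR mgr;
            SHA256_HASH_CTX ctxs[NBUF];
            SHA256_HASH_CTX *done;
            const char *bufs[NBUF] = { "one", "two", "three", "four" };

            sha256_ctx_mgr_init(&mgr);

            for (uintptr_t i = 0; i < NBUF; i++) {
                    hash_ctx_init(&ctxs[i]);
                    ctxs[i].user_data = (void *)i;          /* tag so jobs can be told apart */
                    done = sha256_ctx_mgr_submit(&mgr, &ctxs[i], bufs[i],
                                                 (uint32_t)strlen(bufs[i]), HASH_ENTIRE);
                    if (done != NULL)
                            printf("job %zu finished during submit\n",
                                   (size_t)(uintptr_t)hash_ctx_user_data(done));
            }

            /* Completed jobs may come back in any order; flush until empty. */
            while ((done = sha256_ctx_mgr_flush(&mgr)) != NULL)
                    printf("job %zu complete, error %d\n",
                           (size_t)(uintptr_t)hash_ctx_user_data(done),
                           (int)hash_ctx_error(done));
            return 0;
    }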
diff --git a/src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h b/src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h
new file mode 100644
index 000000000..aee6156f1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/sha512_mb.h
@@ -0,0 +1,422 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _SHA512_MB_H_
+#define _SHA512_MB_H_
+
+/**
+ * @file sha512_mb.h
+ * @brief Single/Multi-buffer CTX API SHA512 function prototypes and structures
+ *
+ * Interface for single and multi-buffer SHA512 functions
+ *
+ * <b> Single/Multi-buffer SHA512 Entire or First-Update..Update-Last </b>
+ *
+ * The interface to this single/multi-buffer hashing code is carried out through the
+ * context-level (CTX) init, submit and flush functions and the SHA512_HASH_CTX_MGR and
+ * SHA512_HASH_CTX objects. Numerous SHA512_HASH_CTX objects may be instantiated by the
+ * application for use with a single SHA512_HASH_CTX_MGR.
+ *
+ * The CTX interface functions carry out the initialization and padding of the jobs
+ * entered by the user and add them to the multi-buffer manager. The lower level "scheduler"
+ * layer then processes the jobs in an out-of-order manner. The scheduler layer functions
+ * are internal and are not intended to be invoked directly. Jobs can be submitted
+ * to a CTX as a complete buffer to be hashed, using the HASH_ENTIRE flag, or as partial
+ * jobs which can be started using the HASH_FIRST flag, and later resumed or finished
+ * using the HASH_UPDATE and HASH_LAST flags respectively.
+ *
+ * <b>Note:</b> The submit function does not require data buffers to be block sized.
+ *
+ * The SHA512 CTX interface functions are available for 5 architectures: multi-buffer SSE,
+ * AVX, AVX2, AVX512 and single-buffer SSE4 (which is used in the same way as the
+ * multi-buffer code). In addition, a multibinary interface is provided, which selects the
+ * appropriate architecture-specific function at runtime. This multibinary interface
+ * selects the single buffer SSE4 functions when the platform is detected to be Silvermont.
+ *
+ * <b>Usage:</b> The application creates a SHA512_HASH_CTX_MGR object and initializes it
+ * with a call to the sha512_ctx_mgr_init*() function, where henceforth "*" stands for the
+ * relevant suffix for each architecture: _sse, _avx, _avx2, _avx512 (or no suffix for the
+ * multibinary version). The SHA512_HASH_CTX_MGR object will be used to schedule processor
+ * resources, with up to 2 SHA512_HASH_CTX objects (or 4 in the AVX2 case, 8 in the AVX512
+ * case) being processed at a time.
+ *
+ * Each SHA512_HASH_CTX must be initialized before first use by the hash_ctx_init macro
+ * defined in multi_buffer.h. After initialization, the application may begin computing
+ * a hash by giving the SHA512_HASH_CTX to a SHA512_HASH_CTX_MGR using the submit functions
+ * sha512_ctx_mgr_submit*() with the HASH_FIRST flag set. When the SHA512_HASH_CTX is
+ * returned to the application (via this or a later call to sha512_ctx_mgr_submit*() or
+ * sha512_ctx_mgr_flush*()), the application can then re-submit it with another call to
+ * sha512_ctx_mgr_submit*(), but without the HASH_FIRST flag set.
+ *
+ * Ideally, on the last buffer for that hash, sha512_ctx_mgr_submit_sse is called with
+ * HASH_LAST, although it is also possible to submit the hash with HASH_LAST and a zero
+ * length if necessary. When a SHA512_HASH_CTX is returned after having been submitted with
+ * HASH_LAST, it will contain a valid hash. The SHA512_HASH_CTX can be reused immediately
+ * by submitting with HASH_FIRST.
+ *
+ * For example, you would submit hashes with the following flags for the following numbers
+ * of buffers:
+ * <ul>
+ * <li> one buffer: HASH_FIRST | HASH_LAST (or, equivalently, HASH_ENTIRE)
+ * <li> two buffers: HASH_FIRST, HASH_LAST
+ * <li> three buffers: HASH_FIRST, HASH_UPDATE, HASH_LAST
+ * etc.
+ * </ul>
+ *
+ * The order in which SHA512_CTX objects are returned is in general different from the order
+ * in which they are submitted.
+ *
+ * A few possible error conditions exist:
+ * <ul>
+ * <li> Submitting flags other than the allowed entire/first/update/last values
+ * <li> Submitting a context that is currently being managed by a SHA512_HASH_CTX_MGR. (Note:
+ * This error case is not applicable to the single buffer SSE4 version)
+ * <li> Submitting a context after HASH_LAST is used but before HASH_FIRST is set.
+ * </ul>
+ *
+ * These error conditions are reported by returning the SHA512_HASH_CTX immediately after
+ * a submit with its error member set to a non-zero error code (defined in
+ * multi_buffer.h). No changes are made to the SHA512_HASH_CTX_MGR in the case of an
+ * error; no processing is done for other hashes.
+ *
+ */
+
+#include <stdint.h>
+#include "multi_buffer.h"
+#include "types.h"
+
+#ifndef _MSC_VER
+#include <stdbool.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Hash Constants and Typedefs
+#define SHA512_DIGEST_NWORDS 8
+#define SHA512_MAX_LANES 8
+#define SHA512_X4_LANES 4
+#define SHA512_MIN_LANES 2
+#define SHA512_BLOCK_SIZE 128
+#define SHA512_LOG2_BLOCK_SIZE 7
+#define SHA512_PADLENGTHFIELD_SIZE 16
+#define SHA512_INITIAL_DIGEST \
+ 0x6a09e667f3bcc908,0xbb67ae8584caa73b,0x3c6ef372fe94f82b,0xa54ff53a5f1d36f1, \
+ 0x510e527fade682d1,0x9b05688c2b3e6c1f,0x1f83d9abfb41bd6b,0x5be0cd19137e2179
+
+
+typedef uint64_t sha512_digest_array[SHA512_DIGEST_NWORDS][SHA512_MAX_LANES];
+typedef uint64_t SHA512_WORD_T;
+
+/** @brief Scheduler layer - Holds info describing a single SHA512 job for the multi-buffer manager */
+
+typedef struct {
+ uint8_t* buffer; //!< pointer to data buffer for this job
+ uint64_t len; //!< length of buffer for this job in blocks.
+ DECLARE_ALIGNED(uint64_t result_digest[SHA512_DIGEST_NWORDS], 64);
+ JOB_STS status; //!< output job status
+ void* user_data; //!< pointer for user's job-related data
+} SHA512_JOB;
+
+/** @brief Scheduler layer - Holds arguments for submitted SHA512 job */
+
+typedef struct {
+ sha512_digest_array digest;
+ uint8_t* data_ptr[SHA512_MAX_LANES];
+} SHA512_MB_ARGS_X8;
+
+/** @brief Scheduler layer - Lane data */
+
+typedef struct {
+ SHA512_JOB *job_in_lane;
+} SHA512_LANE_DATA;
+
+/** @brief Scheduler layer - Holds state for multi-buffer SHA512 jobs */
+
+typedef struct {
+ SHA512_MB_ARGS_X8 args;
+ uint64_t lens[SHA512_MAX_LANES];
+ uint64_t unused_lanes; //!< each byte is index (00, 01 or 00...03) of unused lanes, byte 2 or 4 is set to FF as a flag
+ SHA512_LANE_DATA ldata[SHA512_MAX_LANES];
+ uint32_t num_lanes_inuse;
+} SHA512_MB_JOB_MGR;
+
+/** @brief Context layer - Holds state for multi-buffer SHA512 jobs */
+
+typedef struct {
+ SHA512_MB_JOB_MGR mgr;
+} SHA512_HASH_CTX_MGR;
+
+/** @brief Context layer - Holds info describing a single SHA512 job for the multi-buffer CTX manager */
+
+typedef struct {
+ SHA512_JOB job; // Must be at struct offset 0.
+ HASH_CTX_STS status; //!< Context status flag
+ HASH_CTX_ERROR error; //!< Context error flag
+ uint32_t total_length; //!< Running counter of length processed for this CTX's job
+ const void* incoming_buffer; //!< pointer to data input buffer for this CTX's job
+ uint32_t incoming_buffer_length; //!< length of buffer for this job in bytes.
+ uint8_t partial_block_buffer[SHA512_BLOCK_SIZE * 2]; //!< CTX partial blocks
+ uint32_t partial_block_buffer_length;
+ void* user_data; //!< pointer for user to keep any job-related data
+} SHA512_HASH_CTX;
+
+/*******************************************************************
+ * Context level API function prototypes
+ ******************************************************************/
+
+/**
+ * @brief Initialize the context level SHA512 multi-buffer manager structure.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init_sse (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the context level multi-buffer manager.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit_sse (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires SSE4.1
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush_sse (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA512 multi-buffer manager structure.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init_avx (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the multi-buffer manager.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit_avx (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires AVX
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush_avx (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA512 multi-buffer manager structure.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init_avx2 (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the multi-buffer manager.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit_avx2 (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires AVX2
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush_avx2 (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA512 multi-buffer manager structure.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init_avx512 (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the multi-buffer manager.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit_avx512 (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush_avx512 (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Initialize the SHA512 multi-buffer manager structure.
+ * @requires SSE4
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init_sb_sse4 (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the multi-buffer manager.
+ * @requires SSE4
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit_sb_sse4 (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires SSE4
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush_sb_sse4 (SHA512_HASH_CTX_MGR* mgr);
+
+/******************** multibinary function prototypes **********************/
+
+/**
+ * @brief Initialize the SHA512 multi-buffer manager structure.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns void
+ */
+void sha512_ctx_mgr_init (SHA512_HASH_CTX_MGR* mgr);
+
+/**
+ * @brief Submit a new SHA512 job to the multi-buffer manager.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @param ctx Structure holding ctx job info
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param flags Input flag specifying job type (first, update, last or entire)
+ * @returns NULL if no jobs complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_submit (SHA512_HASH_CTX_MGR* mgr, SHA512_HASH_CTX* ctx,
+ const void* buffer, uint32_t len, HASH_CTX_FLAG flags);
+
+/**
+ * @brief Finish all submitted SHA512 jobs and return when complete.
+ * @requires SSE4.1 or AVX or AVX2 or AVX512
+ *
+ * @param mgr Structure holding context level state info
+ * @returns NULL if no jobs to complete or pointer to jobs structure.
+ */
+SHA512_HASH_CTX* sha512_ctx_mgr_flush (SHA512_HASH_CTX_MGR* mgr);
+
+/*******************************************************************
+ * Scheduler (internal) level out-of-order function prototypes
+ ******************************************************************/
+
+void sha512_mb_mgr_init_sse (SHA512_MB_JOB_MGR *state);
+SHA512_JOB* sha512_mb_mgr_submit_sse (SHA512_MB_JOB_MGR *state, SHA512_JOB* job);
+SHA512_JOB* sha512_mb_mgr_flush_sse (SHA512_MB_JOB_MGR *state);
+
+#define sha512_mb_mgr_init_avx sha512_mb_mgr_init_sse
+SHA512_JOB* sha512_mb_mgr_submit_avx (SHA512_MB_JOB_MGR *state, SHA512_JOB* job);
+SHA512_JOB* sha512_mb_mgr_flush_avx (SHA512_MB_JOB_MGR *state);
+
+void sha512_mb_mgr_init_avx2 (SHA512_MB_JOB_MGR *state);
+SHA512_JOB* sha512_mb_mgr_submit_avx2 (SHA512_MB_JOB_MGR *state, SHA512_JOB* job);
+SHA512_JOB* sha512_mb_mgr_flush_avx2 (SHA512_MB_JOB_MGR *state);
+
+void sha512_mb_mgr_init_avx512 (SHA512_MB_JOB_MGR *state);
+SHA512_JOB* sha512_mb_mgr_submit_avx512 (SHA512_MB_JOB_MGR *state, SHA512_JOB* job);
+SHA512_JOB* sha512_mb_mgr_flush_avx512 (SHA512_MB_JOB_MGR *state);
+
+// Single buffer SHA512 APIs, optimized for SLM.
+void sha512_sse4 (const void* M, void* D, uint64_t L);
+// Note that these APIs follow the same high-level usage model as the multi-buffer APIs
+void sha512_sb_mgr_init_sse4 (SHA512_MB_JOB_MGR *state);
+SHA512_JOB* sha512_sb_mgr_submit_sse4 (SHA512_MB_JOB_MGR *state, SHA512_JOB* job);
+SHA512_JOB* sha512_sb_mgr_flush_sse4 (SHA512_MB_JOB_MGR *state);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _SHA512_MB_H_
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/include/test.h b/src/crypto/isa-l/isa-l_crypto/include/test.h
new file mode 100644
index 000000000..41a21626e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/test.h
@@ -0,0 +1,81 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+#ifndef _TEST_H
+#define _TEST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Use sys/time.h functions for time
+
+#include <stdio.h>
+#include <sys/time.h>
+
+struct perf{
+ struct timeval tv;
+};
+
+
+inline int perf_start(struct perf *p)
+{
+ return gettimeofday(&(p->tv), 0);
+}
+inline int perf_stop(struct perf *p)
+{
+ return gettimeofday(&(p->tv), 0);
+}
+
+inline void perf_print(struct perf stop, struct perf start, long long dsize)
+{
+ long long secs = stop.tv.tv_sec - start.tv.tv_sec;
+ long long usecs = secs * 1000000 + stop.tv.tv_usec - start.tv.tv_usec;
+
+ printf("runtime = %10lld usecs", usecs);
+ if (dsize != 0) {
+#if 1 // no printf bug with 64-bit values here; the split calls below were a 32-bit workaround
+ printf(", bandwidth %lld MB in %.4f sec = %.2f MB/s\n", dsize/(1024*1024),
+ ((double) usecs)/1000000, ((double) dsize) / (double)usecs);
+#else
+ printf(", bandwidth %lld MB ", dsize/(1024*1024));
+ printf("in %.4f sec ",(double)usecs/1000000);
+ printf("= %.2f MB/s\n", (double)dsize/usecs);
+#endif
+ }
+ else
+ printf("\n");
+}
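+
+/* Illustrative usage sketch (run_test() and data_len are placeholders, not part of
+   this header):
+
+       struct perf start, stop;
+
+       perf_start(&start);
+       run_test();              // code under measurement
+       perf_stop(&stop);
+       perf_print(stop, start, data_len);
+*/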
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _TEST_H
diff --git a/src/crypto/isa-l/isa-l_crypto/include/types.h b/src/crypto/isa-l/isa-l_crypto/include/types.h
new file mode 100644
index 000000000..caf3bac59
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/include/types.h
@@ -0,0 +1,71 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+
+/**
+ * @file types.h
+ * @brief Defines common align and debug macros
+ *
+ */
+
+#ifndef __TYPES_H
+#define __TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#if defined __unix__ || defined __APPLE__
+# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
+# define __forceinline static inline
+# define aligned_free(x) free(x)
+#else
+# ifdef __MINGW32__
+# define DECLARE_ALIGNED(decl, alignval) decl __attribute__((aligned(alignval)))
+# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
+# define aligned_free(x) _aligned_free(x)
+# else
+# define DECLARE_ALIGNED(decl, alignval) __declspec(align(alignval)) decl
+# define posix_memalign(p, algn, len) (NULL == (*((char**)(p)) = (void*) _aligned_malloc(len, algn)))
+# define aligned_free(x) _aligned_free(x)
+# endif
+#endif
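+
+/* Usage sketch (illustrative): declare a 64-byte aligned buffer in a form that works
+   with both the GCC/clang and MSVC variants of the macro above:
+
+       DECLARE_ALIGNED(uint8_t buffer[128], 64);
+*/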
+
+#ifdef DEBUG
+# define DEBUG_PRINT(x) printf x
+#else
+# define DEBUG_PRINT(x) do {} while (0)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //__TYPES_H
diff --git a/src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def b/src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def
new file mode 100644
index 000000000..4301794a1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/isa-l_crypto.def
@@ -0,0 +1,59 @@
+LIBRARY isa-l_crypto
+EXPORTS
+
+sha1_ctx_mgr_init @1
+sha1_ctx_mgr_submit @2
+sha1_ctx_mgr_flush @3
+sha256_ctx_mgr_init @4
+sha256_ctx_mgr_submit @5
+sha256_ctx_mgr_flush @6
+sha512_ctx_mgr_init @7
+sha512_ctx_mgr_submit @8
+sha512_ctx_mgr_flush @9
+md5_ctx_mgr_init @10
+md5_ctx_mgr_submit @11
+md5_ctx_mgr_flush @12
+mh_sha1_init @13
+mh_sha1_update @14
+mh_sha1_finalize @15
+mh_sha1_finalize_base @16
+mh_sha1_update_base @17
+mh_sha1_murmur3_x64_128_init @18
+mh_sha1_murmur3_x64_128_finalize_base @19
+mh_sha1_murmur3_x64_128_update_base @20
+mh_sha1_murmur3_x64_128_update @21
+mh_sha1_murmur3_x64_128_finalize @22
+aes_keyexp_128 @23
+aes_keyexp_192 @24
+aes_keyexp_256 @25
+aesni_gcm128_enc @26
+aesni_gcm128_dec @27
+aesni_gcm128_init @28
+aesni_gcm128_enc_update @29
+aesni_gcm128_dec_update @30
+aesni_gcm128_enc_finalize @31
+aesni_gcm128_dec_finalize @32
+aesni_gcm256_enc @33
+aesni_gcm256_dec @34
+aesni_gcm256_init @35
+aesni_gcm256_enc_update @36
+aesni_gcm256_dec_update @37
+aesni_gcm256_enc_finalize @38
+aesni_gcm256_dec_finalize @39
+aesni_gcm256_pre @40
+aesni_gcm128_pre @41
+aes_cbc_enc_128 @42
+aes_cbc_dec_128 @43
+aes_cbc_enc_192 @44
+aes_cbc_dec_192 @45
+aes_cbc_enc_256 @46
+aes_cbc_dec_256 @47
+aes_cbc_precomp @48
+XTS_AES_128_enc @49
+XTS_AES_128_enc_expanded_key @50
+XTS_AES_128_dec @51
+XTS_AES_128_dec_expanded_key @52
+XTS_AES_256_enc @53
+XTS_AES_256_enc_expanded_key @54
+XTS_AES_256_dec @55
+XTS_AES_256_dec_expanded_key @56
diff --git a/src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in b/src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in
new file mode 100644
index 000000000..41ba8d5a3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/libisal_crypto.pc.in
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: libisal_crypto
+Description: Crypto library for storage systems
+Version: @VERSION@
+Libs: -L${libdir} -lisal_crypto
+Libs.private:
+Cflags: -I${includedir}
diff --git a/src/crypto/isa-l/isa-l_crypto/make.inc b/src/crypto/isa-l/isa-l_crypto/make.inc
new file mode 100644
index 000000000..906f9de4f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/make.inc
@@ -0,0 +1,239 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+
+# Makefile include for optimized libraries
+# make targets:
+# lib - build library of optimized functions
+# slib - build shared library
+# test - run unit tests of functions
+# perf - run performance tests
+# install - install headers and libs to system location
+# sim - run on simulator
+# trace - get simulator trace
+# clean - remove object files
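+#
+# Example invocations (illustrative; run from the library source directory):
+#   make -f Makefile.unx lib     # build the static library
+#   make -f Makefile.unx test    # build and run the unit tests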
+
+version ?= #auto filled on release
+
+CC = gcc
+AS = yasm
+SIM = sde $(SIMFLAGS) --
+
+DEBUG = -g
+DEBUG_yasm = -g dwarf2
+DEBUG_nasm = -g
+
+# Default arch= build options
+CFLAGS_gcc = -Wall
+ASFLAGS_ = -f elf64
+ARFLAGS_ = cr $@
+STRIP_gcc = strip -d -R .comment $@
+
+# arch=32 build options
+ASFLAGS_32 = -f elf32
+CFLAGS_32 = -m32
+ARFLAGS_32 = cr $@
+
+# arch=win64 build options
+ASFLAGS_win64 = -f win64
+CFLAGS_icl = -Qstd=c99
+ARFLAGS_win64 = -out:$@
+
+# arch=mingw build options
+ASFLAGS_mingw = -f win64
+ARFLAGS_mingw = cr $@
+lsrcmingw = $(lsrc)
+unit_testsmingw = $(unit_tests)
+examplesmingw = $(examples)
+perf_testsmingw = $(perf_tests)
+
+ifeq ($(arch),mingw)
+ CC=x86_64-w64-mingw32-gcc
+ AR=x86_64-w64-mingw32-ar
+ LDFLAGS = -Wl,--force-exe-suffix
+endif
+
+
+INCLUDE = $(patsubst %,-I%/,$(subst :, ,$(VPATH)))
+CFLAGS = $(CFLAGS_$(arch)) $(CFLAGS_$(CC)) $(DEBUG) -O2 $(DEFINES) $(INCLUDE)
+ASFLAGS = $(ASFLAGS_$(arch)) $(ASFLAGS_$(CC)) $(DEBUG_$(AS)) $(DEFINES) $(INCLUDE)
+ARFLAGS = $(ARFLAGS_$(arch))
+DEFINES += $(addprefix -D , $D)
+
+O = bin
+lobj += $(patsubst %.c,%.o,$(patsubst %.asm,%.o,$(lsrc$(arch)) $(lsrc_intrinsic)))
+objs = $(addprefix $(O)/,$(notdir $(lobj)))
+
+
+lib_name ?= isa-l_crypto.a
+default: lib slib
+
+# Defaults for windows build
+ifeq ($(arch),win64)
+ AR=lib
+ CC=cl
+ OUTPUT_OPTION = -Fo$@
+ DEBUG=
+ lib_name := $(basename $(lib_name)).lib
+endif
+lsrcwin64 = $(lsrc)
+unit_testswin64 = $(unit_tests)
+exampleswin64 = $(examples)
+perf_testswin64 = $(perf_tests)
+
+# Build and run unit tests, performance tests, etc.
+all_tests = $(notdir $(sort $(perf_tests$(arch)) $(check_tests$(arch)) $(unit_tests$(arch)) $(examples$(arch)) $(other_tests)))
+all_unit_tests = $(notdir $(sort $(check_tests$(arch)) $(unit_tests$(arch))))
+all_perf_tests = $(notdir $(sort $(perf_tests$(arch))))
+
+$(all_unit_tests): % : %.c $(lib_name)
+$(all_perf_tests): % : %.c $(lib_name)
+$(sort $(notdir $(examples$(arch)))): % : %.c $(lib_name)
+$(sort $(notdir $(other_tests))): % : %.c $(lib_name)
+
+sim test trace: $(addsuffix .run,$(all_unit_tests))
+perf: $(addsuffix .run,$(all_perf_tests))
+ex: $(notdir $(examples$(arch)))
+all: lib $(all_tests)
+other: $(notdir $(other_tests))
+tests: $(all_unit_tests)
+perfs: $(all_perf_tests)
+check test perf: SIM=
+trace: SIMFLAGS = -debugtrace
+check test sim:
+ @echo Finished running $@
+
+$(objs): | $(O)
+$(O): ; mkdir -p $(O)
+
+# Build rule to run tests
+$(addsuffix .run,$(all_tests)): %.run : %
+ $(SIM) ./$<
+ @echo Completed run: $<
+
+# Other build rules
+msg = $(if $(DEBUG),DEBUG) $(patsubst 32,32-bit,$(arch)) $D
+
+$(O)/%.o: %.asm
+ @echo " ---> Building $< $(msg)"
+ @$(AS) $(ASFLAGS) -o $@ $<
+
+$(O)/%.o %.o: %.c
+ @echo " ---> Building $< $(msg)"
+ @$(COMPILE.c) $(OUTPUT_OPTION) $<
+
+$(all_tests):
+ @echo " ---> Building Test $@ $(msg)"
+ @$(LINK.o) $(CFLAGS) $^ $(LDLIBS) -o $@
+
+
+# Target to build lib files
+lib: $(lib_name)
+ifneq ($(lib_debug),1)
+ $(lib_name): DEBUG_$(AS)= # Don't put debug symbols in the lib
+ $(lib_name): DEBUG=
+ $(lib_name): DEFINES+=-D NDEBUG
+endif
+ifeq ($(lib_debug),1)
+ DEBUG+=-D DEBUG # Define DEBUG for macros
+endif
+
+#lib $(lib_name): $(lib_name)(${objs})
+$(lib_name): $(objs)
+ @echo " ---> Creating Lib $@"
+ @$(AR) $(ARFLAGS) $^
+ @$(STRIP_$(CC))
+
+
+# Target for shared lib
+so_lib_name = bin/libisal_crypto.so
+so_lib_inst = $(notdir $(so_lib_name))
+so_lib_ver = $(so_lib_inst).$(version)
+soname = $(so_lib_inst).$(word 1, $(subst ., ,$(version)))
+
+slib: $(so_lib_name)
+aobjs += $(addprefix $(O)/,$(patsubst %.asm,%.o,$(filter %.asm,$(notdir $(lsrc$(arch)) $(lsrc_intrinsic)))))
+shared_objs += $(addprefix $(O)/shared_ver_,$(patsubst %.c,%.o,$(filter %.c,$(notdir $(lsrc$(arch)) $(lsrc_intrinsic)))))
+
+$(O)/shared_ver_%.o: %.c
+ @echo " ---> Building shared $< $(msg)"
+ @$(COMPILE.c) $(OUTPUT_OPTION) $<
+
+ifneq ($(lib_debug),1)
+ $(so_lib_name): DEBUG_$(AS)=
+ $(so_lib_name): DEBUG=
+ $(so_lib_name): DEFINES+=-D NDEBUG
+endif
+
+$(shared_objs): CFLAGS += -fPIC
+$(shared_objs) $(aobjs): | $(O)
+$(so_lib_name): LDFLAGS+=-Wl,-soname,$(soname)
+$(so_lib_name): $(shared_objs) $(aobjs)
+ @echo " ---> Creating Shared Lib $@"
+ @$(CC) $(CFLAGS) --shared $(LDFLAGS) -o $@ $^
+ @(cd $(@D); ln -f -s $(so_lib_inst) $(soname))
+
+# Target for install
+prefix = /usr/local
+install_dirs = $(prefix)/lib $(prefix)/include/isa-l_crypto
+$(install_dirs): ; mkdir -p $@
+install: $(sort $(extern_hdrs)) | $(install_dirs) $(lib_name) $(so_lib_name) isa-l_crypto.h
+ install -m 644 $(lib_name) $(prefix)/lib/libisal_crypto.a
+ install -m 644 $^ $(prefix)/include/isa-l_crypto/.
+ install -m 664 include/isa-l_crypto.h $(prefix)/include/.
+ install -m 664 $(so_lib_name) $(prefix)/lib/$(so_lib_ver)
+ (cd $(prefix)/lib && ln -f -s $(so_lib_ver) $(soname) && ln -f -s $(so_lib_ver) $(so_lib_inst))
+ifeq ($(shell uname),Darwin)
+ (cd $(prefix)/lib && ln -f -s $(so_lib_ver) $(basename $(so_lib_inst)).dylib)
+endif
+ which libtool && libtool --mode=finish $(prefix)/lib || \
+ echo 'Lib installed at $(prefix)/lib. Run system-dependent programs to add shared lib path.'
+
+uninstall:
+ $(RM) $(prefix)/lib/libisal_crypto.a
+ $(RM) $(prefix)/lib/$(soname)
+ $(RM) $(prefix)/lib/$(so_lib_ver)
+ $(RM) $(prefix)/lib/$(so_lib_inst)
+ $(RM) -r $(prefix)/include/isa-l_crypto
+ $(RM) $(prefix)/include/isa-l_crypto.h
+ $(RM) $(prefix)/lib/$(basename $(so_lib_inst)).dylib
+
+# Collect performance data
+rpt_name = perf_report_$(shell uname -n)_$(shell date +%y%m%d).perf
+
+perf_report:
+ echo Results for $(rpt_name) >> $(rpt_name)
+ $(MAKE) -f Makefile.unx -k perf | tee -a $(rpt_name)
+ @echo Summary:
+ -grep runtime $(rpt_name)
+
+
+clean:
+ @echo Cleaning up
+ @$(RM) -r $(O) *.o *.a $(all_tests) $(lib_name) $(so_lib_name)
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am
new file mode 100644
index 000000000..8001e4310
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/Makefile.am
@@ -0,0 +1,83 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc += md5_mb/md5_ctx_sse.c \
+ md5_mb/md5_ctx_avx.c \
+ md5_mb/md5_ctx_avx2.c
+
+lsrc += md5_mb/md5_mb_mgr_init_sse.c \
+ md5_mb/md5_mb_mgr_init_avx2.c \
+ md5_mb/md5_mb_mgr_init_avx512.c
+
+lsrc += md5_mb/md5_mb_mgr_submit_sse.asm \
+ md5_mb/md5_mb_mgr_submit_avx.asm \
+ md5_mb/md5_mb_mgr_submit_avx2.asm \
+ md5_mb/md5_mb_mgr_flush_sse.asm \
+ md5_mb/md5_mb_mgr_flush_avx.asm \
+ md5_mb/md5_mb_mgr_flush_avx2.asm \
+ md5_mb/md5_mb_x4x2_sse.asm \
+ md5_mb/md5_mb_x4x2_avx.asm \
+ md5_mb/md5_mb_x8x2_avx2.asm \
+ md5_mb/md5_multibinary.asm
+
+lsrc += md5_mb/md5_mb_mgr_submit_avx512.asm \
+ md5_mb/md5_mb_mgr_flush_avx512.asm \
+ md5_mb/md5_mb_x16x2_avx512.asm \
+ md5_mb/md5_ctx_avx512.c
+
+extern_hdrs += include/md5_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ md5_mb/md5_job.asm \
+ md5_mb/md5_mb_mgr_datastruct.asm \
+ md5_mb/md5_ref.c \
+ include/reg_sizes.asm \
+ include/multibinary.asm \
+ include/memcpy_inline.h \
+ include/intrinreg.h
+
+check_tests += md5_mb/md5_mb_test \
+ md5_mb/md5_mb_rand_test \
+ md5_mb/md5_mb_rand_update_test
+
+unit_tests += md5_mb/md5_mb_rand_ssl_test
+
+perf_tests += md5_mb/md5_mb_vs_ossl_perf
+
+
+md5_mb_rand_test: md5_ref.o
+md5_mb_md5_mb_rand_test_LDADD = md5_mb/md5_ref.lo libisal_crypto.la
+md5_mb_rand_update_test: md5_ref.o
+md5_mb_md5_mb_rand_update_test_LDADD = md5_mb/md5_ref.lo libisal_crypto.la
+md5_mb_rand_ssl_test: LDLIBS += -lcrypto
+md5_mb_md5_mb_rand_ssl_test_LDFLAGS = -lcrypto
+md5_mb_vs_ossl_perf: LDLIBS += -lcrypto
+md5_mb_md5_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c
new file mode 100644
index 000000000..2125be63b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx.c
@@ -0,0 +1,249 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_avx(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_avx(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_avx(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_avx(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (MD5_BLOCK_SIZE - 1);
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
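+	// Round i up so that the 8-byte length field stored below ends exactly on a
+	// block boundary, giving 1 or 2 extra blocks to hash.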
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_avx_slver_02020183;
+struct slver md5_ctx_mgr_init_avx_slver = { 0x0183, 0x02, 0x02 };
+
+struct slver md5_ctx_mgr_submit_avx_slver_02020184;
+struct slver md5_ctx_mgr_submit_avx_slver = { 0x0184, 0x02, 0x02 };
+
+struct slver md5_ctx_mgr_flush_avx_slver_02020185;
+struct slver md5_ctx_mgr_flush_avx_slver = { 0x0185, 0x02, 0x02 };
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c
new file mode 100644
index 000000000..71618a3c8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx2.c
@@ -0,0 +1,249 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_avx2(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_avx2(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_avx2(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (MD5_BLOCK_SIZE - 1);
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_avx2_slver_04020186;
+struct slver md5_ctx_mgr_init_avx2_slver = { 0x0186, 0x02, 0x04 };
+
+struct slver md5_ctx_mgr_submit_avx2_slver_04020187;
+struct slver md5_ctx_mgr_submit_avx2_slver = { 0x0187, 0x02, 0x04 };
+
+struct slver md5_ctx_mgr_flush_avx2_slver_04020188;
+struct slver md5_ctx_mgr_flush_avx2_slver = { 0x0188, 0x02, 0x04 };
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c
new file mode 100644
index 000000000..a7f54c2b0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_avx512.c
@@ -0,0 +1,253 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_avx512(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_avx512(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_avx512(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (MD5_BLOCK_SIZE - 1);
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_avx512_slver_0600018c;
+struct slver md5_ctx_mgr_init_avx512_slver = { 0x018c, 0x00, 0x06 };
+
+struct slver md5_ctx_mgr_submit_avx512_slver_0600018d;
+struct slver md5_ctx_mgr_submit_avx512_slver = { 0x018d, 0x00, 0x06 };
+
+struct slver md5_ctx_mgr_flush_avx512_slver_0600018e;
+struct slver md5_ctx_mgr_flush_avx512_slver = { 0x018e, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c
new file mode 100644
index 000000000..8688dfc37
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ctx_sse.c
@@ -0,0 +1,249 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#define inline __inline
+#endif
+
+static inline void hash_init_digest(MD5_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len);
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx);
+
+void md5_ctx_mgr_init_sse(MD5_HASH_CTX_MGR * mgr)
+{
+ md5_mb_mgr_init_sse(&mgr->mgr);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_submit_sse(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < MD5_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = MD5_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= MD5_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= MD5_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return md5_ctx_mgr_resubmit(mgr, ctx);
+}
+
+MD5_HASH_CTX *md5_ctx_mgr_flush_sse(MD5_HASH_CTX_MGR * mgr)
+{
+ MD5_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = md5_ctx_mgr_resubmit(mgr, ctx);
+
+ // If md5_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the HASH_CTX_MGR still need processing. Loop.
+ }
+}
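+
+// Illustrative call sequence for this context-manager API (hash_ctx_init() is the generic
+// context initializer assumed to come from the shared multi-buffer headers):
+//
+//     MD5_HASH_CTX_MGR mgr;
+//     MD5_HASH_CTX ctx;
+//     uint8_t buf[1024] = { 0 };
+//
+//     md5_ctx_mgr_init_sse(&mgr);
+//     hash_ctx_init(&ctx);
+//     md5_ctx_mgr_submit_sse(&mgr, &ctx, buf, sizeof(buf), HASH_ENTIRE);
+//     while (md5_ctx_mgr_flush_sse(&mgr) != NULL)
+//             ;	// drain jobs still parked in partially filled lanes
+//
+// ctx.job.result_digest then holds the MD5 of buf; callers should also check ctx.error.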
+
+static MD5_HASH_CTX *md5_ctx_mgr_resubmit(MD5_HASH_CTX_MGR * mgr, MD5_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (MD5_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ //memcpy(ctx->partial_block_buffer, ((const char*)buffer + len), copy_len);
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % MD5_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= MD5_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (MD5_HASH_CTX *) md5_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(MD5_WORD_T * digest)
+{
+ static const MD5_WORD_T hash_initial_digest[MD5_DIGEST_NWORDS] =
+ { MD5_INITIAL_DIGEST };
+ //memcpy(digest, hash_initial_digest, sizeof(hash_initial_digest));
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
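+
+// MD5_INITIAL_DIGEST expands to the standard MD5 chaining values
+// (0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476) defined in md5_mb.h.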
+
+static inline uint32_t hash_pad(uint8_t padblock[MD5_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (MD5_BLOCK_SIZE - 1);
+
+ // memset(&padblock[i], 0, MD5_BLOCK_SIZE);
+ memclr_fixedlen(&padblock[i], MD5_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ i += ((MD5_BLOCK_SIZE - 1) & (0 - (total_len + MD5_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ MD5_PADLENGTHFIELD_SIZE;
+
+ *((uint64_t *) & padblock[i - 8]) = ((uint64_t) total_len << 3);
+
+ return i >> MD5_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
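+
+// padblock is MD5_BLOCK_SIZE * 2 bytes because the 0x80 terminator plus the 8-byte length
+// field need 9 bytes: whenever fewer than 9 bytes remain in the current block, the padding
+// spills over and two extra blocks are hashed instead of one.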
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver md5_ctx_mgr_init_sse_slver_00020180;
+struct slver md5_ctx_mgr_init_sse_slver = { 0x0180, 0x02, 0x00 };
+
+struct slver md5_ctx_mgr_submit_sse_slver_00020181;
+struct slver md5_ctx_mgr_submit_sse_slver = { 0x0181, 0x02, 0x00 };
+
+struct slver md5_ctx_mgr_flush_sse_slver_00020182;
+struct slver md5_ctx_mgr_flush_sse_slver = { 0x0182, 0x02, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm
new file mode 100644
index 000000000..9f4c510c2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_job.asm
@@ -0,0 +1,55 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define JOB_MD5 structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; JOB_MD5
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD _len, 4, 4 ; length in bytes
+FIELD _result_digest, 4*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+END_FIELDS
+
+%assign _JOB_MD5_size _FIELD_OFFSET
+%assign _JOB_MD5_align _STRUCT_ALIGN
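+
+;; These field offsets mirror the C-side MD5_JOB structure declared in md5_mb.h; the two
+;; definitions are kept in sync by hand so the assembly can read job fields directly.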
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..63743cef5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_datastruct.asm
@@ -0,0 +1,73 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define MD5 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MD5_ARGS_X32
+;;; name size align
+FIELD _digest, 4*4*32, 16 ; transposed digest
+FIELD _data_ptr, 8*32, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _MD5_ARGS_X8_size _FIELD_OFFSET
+%assign _MD5_ARGS_X8_align _STRUCT_ALIGN
+%assign _MD5_ARGS_X16_size _FIELD_OFFSET
+%assign _MD5_ARGS_X16_align _STRUCT_ALIGN
+%assign _MD5_ARGS_X32_size _FIELD_OFFSET
+%assign _MD5_ARGS_X32_align _STRUCT_ALIGN
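+
+;; _digest is stored transposed: word w of lane i lives at _digest + 4*i + w*4*NLANES, where
+;; NLANES is 8, 16 or 32 depending on the manager, so one SIMD register holds the same digest
+;; word across all lanes being hashed together.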
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _MD5_ARGS_X8_size, _MD5_ARGS_X8_align
+FIELD _lens, 4*32, 8
+FIELD _unused_lanes, 8*4, 8
+FIELD _ldata, _LANE_DATA_size*32, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..7b681136c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx.asm
@@ -0,0 +1,243 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x4x2_avx
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_avx
+%define idx r8
+
+%define unused_lanes r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; JOB* md5_mb_mgr_flush_avx(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+global md5_mb_mgr_flush_avx:function
+md5_mb_mgr_flush_avx:
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; if bit (32+3) is set, then all lanes are empty
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+3
+ jc return_null
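+
+	; unused_lanes holds the free lane indices as a nibble stack under an 0xF sentinel; with
+	; all eight lanes free it reads 0xF76543210, and bit 35 (the sentinel's top bit) is set
+	; only when every lane is back on the free stack.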
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
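+
+	; Each _lens entry is packed as (blocks_remaining << 4) | lane_index, so a single SIMD
+	; minimum finds both the shortest outstanding job (len2, in blocks) and its lane (idx).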
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x4x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..ecc283193
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx2.asm
@@ -0,0 +1,251 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x8x2_avx2
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x8x2_avx2
+%define idx rbp
+
+%define unused_lanes r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%define num_lanes_inuse r8
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; JOB* md5_mb_mgr_flush_avx2(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+global md5_mb_mgr_flush_avx2:function
+md5_mb_mgr_flush_avx2:
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x8x2_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..e8d4ca03c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_avx512.asm
@@ -0,0 +1,313 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern md5_mb_x16x2_avx512
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x16x2_avx512
+%define idx rbp
+
+%define unused_lanes ymm7
+%define lane r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%define num_lanes_inuse r8
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+;; Byte shift in MEM addr, reads an extra byte at [addr+32]
+%macro MEM_VPSRLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+ vmovdqu %%TMP_YMM, [%%addr + 1]
+ vmovdqu [%%addr], %%TMP_YMM
+ mov [%%addr + 31], byte 0
+%endmacro
+
+;; Byte shift in MEM addr, reads an extra byte at [addr-1]
+%macro MEM_VPSLLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+ vmovdqu %%TMP_YMM, [%%addr-1]
+ vmovdqu [%%addr], %%TMP_YMM
+ mov [%%addr], byte 0
+%endmacro
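+
+;; The AVX-512 manager keeps unused_lanes as a 32-byte in-memory stack of one-byte lane
+;; indices (the nibble packing of the narrower managers cannot hold 32 lanes), so pushing a
+;; freed lane index back shifts the whole array with these macros rather than a register.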
+
+align 64
+default rel
+section .text
+
+; JOB* md5_mb_mgr_flush_avx512(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+global md5_mb_mgr_flush_avx512:function
+md5_mb_mgr_flush_avx512:
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 31
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 32
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ ; Find min length
+ vmovdqu ymm5, [state + _lens + 2*32]
+ vmovdqu ymm6, [state + _lens + 3*32]
+
+ vpminud ymm4, ymm5, ymm6 ; ymm4 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm4, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm4, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+ vperm2i128 ymm3, ymm4, ymm4, 1 ; ymm3 has halves of ymm4 reversed
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+
+ vpminud ymm2, ymm2, ymm4 ; ymm2 has min value in low dword
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0x3F
+ shr len2, 6
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_6bits]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+ vpsubd ymm5, ymm5, ymm2
+ vpsubd ymm6, ymm6, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+ vmovdqu [state + _lens + 2*32], ymm5
+ vmovdqu [state + _lens + 3*32], ymm6
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x16x2_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov lane, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+
+ shl lane, 8
+ or lane, idx
+ MEM_VPSLLDDQ (state + _unused_lanes), unused_lanes
+ mov [state + _unused_lanes], lane
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16*2]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16*2], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16*2], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16*2], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_6bits:
+ dq 0x00000000FFFFFFC0, 0x0000000000000000
+ dq 0x00000000FFFFFFC0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+lane_16: dq 16
+lane_17: dq 17
+lane_18: dq 18
+lane_19: dq 19
+lane_20: dq 20
+lane_21: dq 21
+lane_22: dq 22
+lane_23: dq 23
+lane_24: dq 24
+lane_25: dq 25
+lane_26: dq 26
+lane_27: dq 27
+lane_28: dq 28
+lane_29: dq 29
+lane_30: dq 30
+lane_31: dq 31
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_md5_mb_mgr_flush_avx512
+no_md5_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..7ee81616f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_flush_sse.asm
@@ -0,0 +1,244 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x4x2_sse
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; UN*X register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_sse
+%define idx r8
+
+%define unused_lanes r9
+
+%define lane_data r10
+
+%define job_rax rax
+%define tmp rax
+
+%endif ;; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; JOB* md5_mb_mgr_flush_sse(MB_MGR_HMAC_OOO *state)
+; arg 1 : rcx : state
+global md5_mb_mgr_flush_sse:function
+md5_mb_mgr_flush_sse:
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; if bit (32+3) is set, then all lanes are empty
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ movdqa xmm0, [state + _lens + 0*16]
+ movdqa xmm1, [state + _lens + 1*16]
+
+ movdqa xmm2, xmm0
+ pminud xmm2, xmm1 ; xmm2 has {D,C,B,A}
+ palignr xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ pminud xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ palignr xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ pminud xmm2, xmm3 ; xmm2 has min value in low dword
+
+ movd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ pand xmm2, [rel clear_low_nibble]
+ pshufd xmm2, xmm2, 0
+
+ psubd xmm0, xmm2
+ psubd xmm1, xmm2
+
+ movdqa [state + _lens + 0*16], xmm0
+ movdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x4x2_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*32]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..b2e983362
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx2.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+
+void md5_mb_mgr_init_avx2(MD5_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes[0] = 0xfedcba9876543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < 16; j++) {
+ state->lens[j] = 0xFFFFFFFF;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
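+
+// 0xfedcba9876543210 packs the sixteen free lane indices as nibbles; with no nibble left
+// over for a sentinel, this manager tracks emptiness with the separate num_lanes_inuse
+// counter that the flush path checks instead of a sentinel bit.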
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..e83b2e38f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_avx512.c
@@ -0,0 +1,44 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+
+void md5_mb_mgr_init_avx512(MD5_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes[0] = 0x0706050403020100;
+ state->unused_lanes[1] = 0x0f0e0d0c0b0a0908;
+ state->unused_lanes[2] = 0x1716151413121110;
+ state->unused_lanes[3] = 0x1f1e1d1c1b1a1918;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < 32; j++) {
+ state->lens[j] = 0xFFFFFFFF;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
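+
+// With 32 lanes the free list no longer fits as nibbles, so unused_lanes widens to four
+// qwords holding one byte per free lane index (0x00..0x1f). lens is poisoned with
+// 0xFFFFFFFF so that an idle lane can never win the minimum-length search during a flush.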
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c
new file mode 100644
index 000000000..049d2147d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_init_sse.c
@@ -0,0 +1,40 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "md5_mb.h"
+
+void md5_mb_mgr_init_sse(MD5_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes[0] = 0xF76543210;
+ for (j = 0; j < 8; j++) {
+ state->lens[j] = 0xFFFFFFFF;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
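+
+// The SSE and AVX kernels (md5_mb_x4x2_*) hash 4x2 = 8 interleaved lanes, hence the eight
+// lens/ldata slots and the nibble stack 0xF76543210: lane indices 0-7 under an 0xF sentinel.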
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..5663942bf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx.asm
@@ -0,0 +1,222 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+default rel
+
+extern md5_mb_x4x2_avx
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_avx
+%define idx r8
+
+%define p r9
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane r10
+
+%define lane_data r11
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global md5_mb_mgr_submit_avx:function
+md5_mb_mgr_submit_avx:
+
+ sub rsp, STACK_SPACE
+ ; we need to save/restore all GPRs because lower layer clobbers them
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*1], rbp
+ mov [rsp + 8*2], r12
+ mov [rsp + 8*3], r13
+ mov [rsp + 8*4], r14
+ mov [rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*6], rsi
+ mov [rsp + 8*7], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovd [state + _args_digest + 4*lane + 0*32], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*32], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*32], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*32], xmm0, 3
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
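+ ; each lens[] entry is (len << 4) | lane, so the minimum value also
+ ; identifies the lane whose job will finish first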
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x4x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*6]
+ mov rdi, [rsp + 8*7]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*1]
+ mov r12, [rsp + 8*2]
+ mov r13, [rsp + 8*3]
+ mov r14, [rsp + 8*4]
+ mov r15, [rsp + 8*5]
+
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..9279b855d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx2.asm
@@ -0,0 +1,235 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x8x2_avx2
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define lane rsi
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%define lane rdx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx needs to be in a register not clobbered by md5_mb_x8x2_avx2
+%define idx rbp
+
+%define p r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define num_lanes_inuse r9
+
+%define lane_data r10
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global md5_mb_mgr_submit_avx2:function
+md5_mb_mgr_submit_avx2:
+
+ sub rsp, STACK_SPACE
+ ; we need to save/restore all GPRs because lower layer clobbers them
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*1], rbp
+ mov [rsp + 8*2], r12
+ mov [rsp + 8*3], r13
+ mov [rsp + 8*4], r14
+ mov [rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*6], rsi
+ mov [rsp + 8*7], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovd [state + _args_digest + 4*lane + 0*64], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*64], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*64], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*64], xmm0, 3
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
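+ ; lane occupancy is tracked with a counter; the hash core only runs
+ ; once all 16 lanes are in use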
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 16
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x8x2_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*6]
+ mov rdi, [rsp + 8*7]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*1]
+ mov r12, [rsp + 8*2]
+ mov r13, [rsp + 8*3]
+ mov r14, [rsp + 8*4]
+ mov r15, [rsp + 8*5]
+
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=32
+
+align 32
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..40102ccce
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_avx512.asm
@@ -0,0 +1,280 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern md5_mb_x16x2_avx512
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define lane rsi
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%define lane rdx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx needs to be in a register not clobbered by md5_mb_x16x2_avx512
+%define idx rbp
+
+%define p r11
+
+%define unused_lanes ymm7
+
+%define job_rax rax
+%define len rax
+
+%define num_lanes_inuse r9
+
+%define lane_data r10
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+;; Byte shift in MEM addr, reads an extra byte at [addr+32]
+%macro MEM_VPSRLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+ vmovdqu %%TMP_YMM, [%%addr + 1]
+ vmovdqu [%%addr], %%TMP_YMM
+ mov [%%addr + 31], byte 0
+%endmacro
+
+;; Byte shift in MEM addr, reads an extra byte at [addr-1]
+%macro MEM_VPSLLDDQ 2
+%define %%addr %1
+%define %%TMP_YMM %2
+ vmovdqu %%TMP_YMM, [%%addr-1]
+ vmovdqu [%%addr], %%TMP_YMM
+ mov [%%addr], byte 0
+%endmacro
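+
+;; With 32 lanes the free-lane list no longer fits in 4-bit nibbles, so
+;; unused_lanes is kept in memory as one byte per lane index and shifted
+;; with the macros above.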
+
+align 64
+default rel
+section .text
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global md5_mb_mgr_submit_avx512:function
+md5_mb_mgr_submit_avx512:
+
+ sub rsp, STACK_SPACE
+ ; we need to save/restore all GPRs because lower layer clobbers them
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*1], rbp
+ mov [rsp + 8*2], r12
+ mov [rsp + 8*3], r13
+ mov [rsp + 8*4], r14
+ mov [rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*6], rsi
+ mov [rsp + 8*7], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov lane, [state + _unused_lanes]
+ and lane, 0x3F
+ MEM_VPSRLDDQ (state + _unused_lanes), unused_lanes
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov DWORD(len), [job + _len]
+
+ shl len, 6 ; low 6 bits store idx
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
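+ ; args_digest is stored transposed: digest word d of lane l sits at
+ ; byte offset d*128 + 4*l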
+ vmovd [state + _args_digest + 4*lane + 0*4*16*2], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*16*2], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*16*2], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*16*2], xmm0, 3
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 32
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has halves of ymm2 reversed
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ ; Find min length
+ vmovdqu ymm5, [state + _lens + 2*32]
+ vmovdqu ymm6, [state + _lens + 3*32]
+
+ vpminud ymm4, ymm5, ymm6 ; ymm4 has {D,C,B,A}
+ vpalignr ymm3, ymm3, ymm4, 8 ; ymm3 has {x,x,D,C}
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has {x,x,E,F}
+ vpalignr ymm3, ymm3, ymm4, 4 ; ymm3 has {x,x,x,E}
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+ vperm2i128 ymm3, ymm4, ymm4, 1 ; ymm3 has halves of ymm4 reversed
+ vpminud ymm4, ymm4, ymm3 ; ymm4 has min value in low dword
+
+ vpminud ymm2, ymm2, ymm4 ; ymm2 has min value in low dword
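+ ; combine the two 16-lane minima so ymm2 holds the minimum across all 32 lanes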
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0x3F
+ shr len2, 6
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_6bits]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+ vpsubd ymm5, ymm5, ymm2
+ vpsubd ymm6, ymm6, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+ vmovdqu [state + _lens + 2*32], ymm5
+ vmovdqu [state + _lens + 3*32], ymm6
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x16x2_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov lane, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+
+ shl lane, 8
+ or lane, idx
+ MEM_VPSLLDDQ (state + _unused_lanes), unused_lanes
+ mov [state + _unused_lanes], lane
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16*2]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16*2], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16*2], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16*2], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*6]
+ mov rdi, [rsp + 8*7]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*1]
+ mov r12, [rsp + 8*2]
+ mov r13, [rsp + 8*3]
+ mov r14, [rsp + 8*4]
+ mov r15, [rsp + 8*5]
+
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=32
+
+align 32
+clear_low_6bits:
+ dq 0x00000000FFFFFFC0, 0x0000000000000000
+ dq 0x00000000FFFFFFC0, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_md5_mb_mgr_submit_avx512
+no_md5_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..e15118583
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_mgr_submit_sse.asm
@@ -0,0 +1,223 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_job.asm"
+%include "md5_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern md5_mb_x4x2_sse
+default rel
+
+%if 1
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%else
+; UN*X register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by md5_mb_x4x2_sse
+%define idx r8
+
+%define p r9
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane r10
+
+%define lane_data r11
+
+%endif ; if 1
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* submit_job(MB_MGR *state, JOB_MD5 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global md5_mb_mgr_submit_sse:function
+md5_mb_mgr_submit_sse:
+
+ sub rsp, STACK_SPACE
+ ; we need to save/restore all GPRs because lower layer clobbers them
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*1], rbp
+ mov [rsp + 8*2], r12
+ mov [rsp + 8*3], r13
+ mov [rsp + 8*4], r14
+ mov [rsp + 8*5], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*6], rsi
+ mov [rsp + 8*7], rdi
+ movdqa [rsp + 8*8 + 16*0], xmm6
+ movdqa [rsp + 8*8 + 16*1], xmm7
+ movdqa [rsp + 8*8 + 16*2], xmm8
+ movdqa [rsp + 8*8 + 16*3], xmm9
+ movdqa [rsp + 8*8 + 16*4], xmm10
+ movdqa [rsp + 8*8 + 16*5], xmm11
+ movdqa [rsp + 8*8 + 16*6], xmm12
+ movdqa [rsp + 8*8 + 16*7], xmm13
+ movdqa [rsp + 8*8 + 16*8], xmm14
+ movdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqu xmm0, [job + _result_digest + 0*16]
+ movd [state + _args_digest + 4*lane + 0*32], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*32], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*32], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*32], xmm0, 3
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ movdqa xmm0, [state + _lens + 0*16]
+ movdqa xmm1, [state + _lens + 1*16]
+
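+ ; non-VEX pminud/palignr overwrite their first operand, so work on a copy of xmm0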
+ movdqa xmm2, xmm0
+ pminud xmm2, xmm1 ; xmm2 has {D,C,B,A}
+ palignr xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ pminud xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ palignr xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ pminud xmm2, xmm3 ; xmm2 has min value in low dword
+
+ movd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ pand xmm2, [rel clear_low_nibble]
+ pshufd xmm2, xmm2, 0
+
+ psubd xmm0, xmm2
+ psubd xmm1, xmm2
+
+ movdqa [state + _lens + 0*16], xmm0
+ movdqa [state + _lens + 1*16], xmm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call md5_mb_x4x2_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov dword [state + _lens + 4*idx], 0xFFFFFFFF
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*32]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 8*8 + 16*0]
+ movdqa xmm7, [rsp + 8*8 + 16*1]
+ movdqa xmm8, [rsp + 8*8 + 16*2]
+ movdqa xmm9, [rsp + 8*8 + 16*3]
+ movdqa xmm10, [rsp + 8*8 + 16*4]
+ movdqa xmm11, [rsp + 8*8 + 16*5]
+ movdqa xmm12, [rsp + 8*8 + 16*6]
+ movdqa xmm13, [rsp + 8*8 + 16*7]
+ movdqa xmm14, [rsp + 8*8 + 16*8]
+ movdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*6]
+ mov rdi, [rsp + 8*7]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*1]
+ mov r12, [rsp + 8*2]
+ mov r13, [rsp + 8*3]
+ mov r14, [rsp + 8*4]
+ mov r15, [rsp + 8*5]
+
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c
new file mode 100644
index 000000000..5efeda710
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_ssl_test.c
@@ -0,0 +1,151 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/md5.h>
+#include "md5_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * MD5_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+
+ printf("multibinary_md5 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ md5_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ MD5(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_md5 test
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
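+ // Flush until the manager returns NULL, i.e. all submitted jobs have completed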
+ while (md5_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != ((uint32_t *) digest_ssl[i])[j]) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ ((uint32_t *) digest_ssl[i])[j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ md5_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ MD5(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_md5 test
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ ((uint32_t *) digest_ssl[i])[j]) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ ((uint32_t *) digest_ssl[i])[j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_md5_ssl rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c
new file mode 100644
index 000000000..451bcbc13
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_test.c
@@ -0,0 +1,196 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "md5_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][MD5_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+
+ printf("multibinary_md5 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ md5_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ md5_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+ // Run sb_md5 test
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ md5_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ md5_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run md5_mb test
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%08X <=> 0x%08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ md5_ctx_mgr_init(mgr);
+
+ // Extend to the end of allocated buffer to construct jobs
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ md5_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // sb_md5 test
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%08X"
+ ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_md5 rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c
new file mode 100644
index 000000000..4737a94bf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_rand_update_test.c
@@ -0,0 +1,291 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "md5_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*MD5_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*MD5_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint32_t digest_ref[TEST_BUFS][MD5_DIGEST_NWORDS];
+
+extern void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+
+ printf("multibinary_md5_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ md5_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ md5_ref(bufs[i], digest_ref[i], TEST_LEN);
+ }
+
+ // Run sb_md5 tests
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ // Add jobs while available or finished
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = md5_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = md5_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = md5_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d fail %8X <=> %8X",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ md5_ref(bufs[i], digest_ref[i], lens[i]);
+ }
+
+ md5_ctx_mgr_init(mgr);
+
+ // Run md5_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = MD5_BLOCK_SIZE +
+ MD5_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % MD5_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem, HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = md5_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = md5_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % MD5_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = md5_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_md5_update rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c
new file mode 100644
index 000000000..bd1ad8e0c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_test.c
@@ -0,0 +1,223 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "md5_mb.h"
+
+typedef uint32_t DigestMD5[MD5_DIGEST_NWORDS];
+
+#define MSGS 13
+#define NUM_JOBS 1000
+
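+// Deterministically map a job index onto one of the MSGS test vectors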
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
+
+static uint8_t msg1[] = "Test vector from febooti.com";
+static uint8_t msg2[] = "12345678901234567890" "12345678901234567890"
+ "12345678901234567890" "12345678901234567890";
+static uint8_t msg3[] = "";
+static uint8_t msg4[] = "abcdefghijklmnopqrstuvwxyz";
+static uint8_t msg5[] = "message digest";
+static uint8_t msg6[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz0123456789";
+static uint8_t msg7[] = "abc";
+static uint8_t msg8[] = "a";
+
+static uint8_t msg9[] = "";
+static uint8_t msgA[] = "abcdefghijklmnopqrstuvwxyz";
+static uint8_t msgB[] = "message digest";
+static uint8_t msgC[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" "abcdefghijklmnopqrstuvwxyz0123456789";
+static uint8_t msgD[] = "abc";
+
+static DigestMD5 expResultDigest1 = { 0x61b60a50, 0xfbb76d3c, 0xf5620cd3, 0x0f3d57ff };
+static DigestMD5 expResultDigest2 = { 0xa2f4ed57, 0x55c9e32b, 0x2eda49ac, 0x7ab60721 };
+static DigestMD5 expResultDigest3 = { 0xd98c1dd4, 0x04b2008f, 0x980980e9, 0x7e42f8ec };
+static DigestMD5 expResultDigest4 = { 0xd7d3fcc3, 0x00e49261, 0x6c49fb7d, 0x3be167ca };
+static DigestMD5 expResultDigest5 = { 0x7d696bf9, 0x8d93b77c, 0x312f5a52, 0xd061f1aa };
+static DigestMD5 expResultDigest6 = { 0x98ab74d1, 0xf5d977d2, 0x2c1c61a5, 0x9f9d419f };
+static DigestMD5 expResultDigest7 = { 0x98500190, 0xb04fd23c, 0x7d3f96d6, 0x727fe128 };
+static DigestMD5 expResultDigest8 = { 0xb975c10c, 0xa8b6f1c0, 0xe299c331, 0x61267769 };
+
+static DigestMD5 expResultDigest9 = { 0xd98c1dd4, 0x04b2008f, 0x980980e9, 0x7e42f8ec };
+static DigestMD5 expResultDigestA = { 0xd7d3fcc3, 0x00e49261, 0x6c49fb7d, 0x3be167ca };
+static DigestMD5 expResultDigestB = { 0x7d696bf9, 0x8d93b77c, 0x312f5a52, 0xd061f1aa };
+static DigestMD5 expResultDigestC = { 0x98ab74d1, 0xf5d977d2, 0x2c1c61a5, 0x9f9d419f };
+static DigestMD5 expResultDigestD = { 0x98500190, 0xb04fd23c, 0x7d3f96d6, 0x727fe128 };
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7, msg8, msg9,
+ msgA, msgB, msgC, msgD
+};
+
+static uint32_t *expResultDigest[MSGS] = {
+ expResultDigest1, expResultDigest2, expResultDigest3,
+ expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7, expResultDigest8, expResultDigest9,
+ expResultDigestA, expResultDigestB, expResultDigestC,
+ expResultDigestD
+};
+
+int main(void)
+{
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+
+ posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ md5_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i], msgs[i],
+ strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
+ while (1) {
+ ctx = md5_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = md5_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = md5_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_md5 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..7e9acde28
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_vs_ossl_perf.c
@@ -0,0 +1,123 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/md5.h>
+#include "md5_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * MD5_DIGEST_NWORDS];
+
+int main(void)
+{
+ MD5_HASH_CTX_MGR *mgr = NULL;
+ MD5_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t) TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ posix_memalign((void *)&mgr, 16, sizeof(MD5_HASH_CTX_MGR));
+ md5_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ MD5(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("md5_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ md5_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (md5_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_md5" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < MD5_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != ((uint32_t *) digest_ssl[i])[j]) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ ((uint32_t *) digest_ssl[i])[j]);
+ }
+ }
+ }
+
+ printf("Multi-buffer md5 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_md5_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm
new file mode 100644
index 000000000..7ce641409
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x16x2_avx512.asm
@@ -0,0 +1,850 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+default rel
+
+;; code to compute 32-lane MD5 (two sets of 16 lanes) using AVX512
+
+;; Stack must be aligned to 64 bytes before call
+
+;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp
+;;
+;; clobbers zmm0-8, 14-31
+
+;; clobbers all GPRs other than arg1 and rbp
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2
+ %define reg4 r9 ; arg3
+ %define var1 rdi
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var1 rdx ; arg2
+ %define var2 rcx ; arg3
+ %define local_func_decl(func_name) global func_name:function internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+;; These are pointers to data block1 and block2 on the stack,
+;; which ping-pong back and forth
+%define DPTR1 rbx
+%define DPTR2 var2
+%define IDX var1
+%define TBL rax
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+%define inp4 r12
+%define inp5 r13
+%define inp6 r14
+%define inp7 r15
+
+;; Transposed Digest Storage
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define A1 zmm4
+%define B1 zmm5
+%define C1 zmm6
+%define D1 zmm7
+
+%define md5c zmm16
+
+%define MASK0 zmm17
+%define MASK1 zmm18
+
+%define TMP0 zmm20
+%define TMP1 zmm21
+
+
+;; Data are stored into the Wx after transposition
+%define W0 zmm8
+%define W1 zmm9
+%define W2 zmm10
+%define W3 zmm11
+%define W4 zmm12
+%define W5 zmm13
+%define W6 zmm14
+%define W7 zmm15
+
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+%define MD5_DIGEST_ROW_SIZE (16*4)
+%define APPEND(a,b) a %+ b
+%define APPEND3(a,b,c) a %+ b %+ c
+
+;; Temporary registers used during data transposition
+
+%define RESZ resb 64*
+;; Assume stack aligned to 64 bytes before call
+;; Therefore FRAMESIZE mod 64 must be 64-8 = 56
+struc STACK
+_DATA: RESZ 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
+_DIGEST: RESZ 8 ; stores Z_AA-Z_DD, Z_AA2-Z_DD2
+_TMPDIGEST: RESZ 2 ; stores Z_AA, Z_BB temporarily
+_RSP_SAVE: RESQ 1 ; original RSP
+endstruc
+
+%define Z_AA rsp + _DIGEST + 64*0
+%define Z_BB rsp + _DIGEST + 64*1
+%define Z_CC rsp + _DIGEST + 64*2
+%define Z_DD rsp + _DIGEST + 64*3
+%define Z_AA1 rsp + _DIGEST + 64*4
+%define Z_BB1 rsp + _DIGEST + 64*5
+%define Z_CC1 rsp + _DIGEST + 64*6
+%define Z_DD1 rsp + _DIGEST + 64*7
+
+%define MD5_DIGEST_ROW_SIZE (32*4)
+
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+%define rot11 7
+%define rot12 12
+%define rot13 17
+%define rot14 22
+%define rot21 5
+%define rot22 9
+%define rot23 14
+%define rot24 20
+%define rot31 4
+%define rot32 11
+%define rot33 16
+%define rot34 23
+%define rot41 6
+%define rot42 10
+%define rot43 15
+%define rot44 21
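+
+;; Naming: rot<round><position> is the RFC 1321 left-rotate amount for that
+;; round, applied cyclically in groups of four (round 1 uses 7,12,17,22,
+;; round 2 uses 5,9,14,20, and so on).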
+
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+
+; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
+; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
+; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
+; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
+; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
+; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
+; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
+; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
+; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}
+
+; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; use r6 in place of t0
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
+
+ vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
+ vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
+ vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
+ vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
+
+ ; use r10 in place of t0
+ vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0}
+ vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2}
+ vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0}
+ vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2}
+
+ vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1}
+ vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2}
+ vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3}
+ vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0}
+
+;; At this point, the registers that contain interesting data are:
+;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
+;; Can use t1 and r14 as scratch registers
+
+ vmovdqa32 %%r14, MASK0
+ vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqa32 %%t1, MASK1
+ vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqa32 %%r2, MASK0
+ vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqa32 %%t0, MASK1
+ vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqa32 %%r3, MASK0
+ vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r7, MASK1
+ vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqa32 %%r1, MASK0
+ vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqa32 %%r5, MASK1
+ vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r0, MASK0
+ vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
+ vmovdqa32 %%r4, MASK1
+ vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
+
+ vmovdqa32 %%r6, MASK0
+ vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
+ vmovdqa32 %%r10, MASK1
+ vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
+
+ vmovdqa32 %%r11, MASK0
+ vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
+ vmovdqa32 %%r15, MASK1
+ vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
+
+ vmovdqa32 %%r9, MASK0
+ vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
+ vmovdqa32 %%r13, MASK1
+ vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
+
+;; At this point r8 and r12 can be used as scratch registers
+
+ vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+
+ vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+ vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+ vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+ vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+ vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+ vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+
+ vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+%endmacro
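+
+;; TRANSPOSE16 works in three stages: vshufps interleaves dword pairs within
+;; each 128-bit lane, vpermi2q with MASK0/MASK1 gathers matching 64-bit pairs
+;; from two registers, and vshuff64x2 merges the 256-bit halves so that each
+;; output register ends up holding one dword position from all 16 lanes.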
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_ARGS1 0
+%xdefine TMP_ D1
+%xdefine D1 C1
+%xdefine C1 B1
+%xdefine B1 A1
+%xdefine A1 TMP_
+%endm
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +Ft(B,C,D) +data +const), nrot)
+;; e.g.: PROCESS_LOOP MD5constx, Mdatax, F_IMMEDx, NROTx
+%macro PROCESS_LOOP 6
+%define %%MD5const %1
+%define %%data %2
+%define %%F_IMMED %3
+%define %%NROT %4
+%define %%TMP_PR0 %5
+%define %%TMP_PR1 %6
+ ; a=b+((a+Ft(b,c,d)+Mj+ti)<<s)
+
+ ; Ft
+ ; 0-15 Ft:F(X,Y,Z)=(X&Y)|((~X)&Z) 0xca
+ ; 16-31 Ft:G(X,Y,Z)=(X&Z)|(Y&(~Z)) 0xe4
+ ; 32-47 Ft:H(X,Y,Z)=X^Y^Z 0x96
+ ; 48-63 Ft:I(X,Y,Z)=Y^(X|(~Z)) 0x39
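+ ;
+ ; These immediates are the 3-input truth tables consumed by vpternlogd:
+ ; with operand bit patterns X=0xF0, Y=0xCC, Z=0xAA, for example
+ ; F = (X&Y)|((~X)&Z) = (0xF0&0xCC)|(0x0F&0xAA) = 0xC0|0x0A = 0xCA.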
+
+ vpaddd A, A, %%MD5const
+ vpaddd A1, A1, %%MD5const
+ vpaddd A, A, [%%data]
+ vpaddd A1, A1, [%%data + 16*64]
+ vmovdqa32 %%TMP_PR0, B ; Copy B
+ vmovdqa32 %%TMP_PR1, B1 ; Copy B1
+ vpternlogd %%TMP_PR0, C, D, %%F_IMMED
+ vpternlogd %%TMP_PR1, C1, D1, %%F_IMMED
+ vpaddd A, A, %%TMP_PR0
+ vpaddd A1, A1, %%TMP_PR1
+ vprold A, A, %%NROT
+ vprold A1, A1, %%NROT
+ vpaddd A, A, B
+ vpaddd A1, A1, B1
+
+ ROTATE_ARGS
+ ROTATE_ARGS1
+%endmacro
+
+align 64
+default rel
+section .text
+
+; void md5_mb_x16x2_avx512(MD5_ARGS *args, UINT64 num_blks)
+; arg 1 : pointer to MD5_ARGS structure
+; arg 2 : number of blocks (>=1)
+
+local_func_decl(md5_mb_x16x2_avx512)
+md5_mb_x16x2_avx512:
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -64
+ mov [rsp + _RSP_SAVE], rax
+
+ mov DPTR1, rsp
+ lea DPTR2, [rsp + 64*32]
+
+ ;; Load MD5 constant pointer to register
+ lea TBL, [MD5_TABLE]
+ vmovdqa32 MASK0, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vmovdqa32 MASK1, [PSHUFFLE_TRANSPOSE16_MASK2]
+
+ ;; Preload input data from 16 segments.
+ xor IDX, IDX
+
+ ;; transpose input onto stack
+ ;; first 16 lanes read
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+ ;; first 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR1+_DATA+(0)*64],W0
+ vmovdqa32 [DPTR1+_DATA+(1)*64],W1
+ vmovdqa32 [DPTR1+_DATA+(2)*64],W2
+ vmovdqa32 [DPTR1+_DATA+(3)*64],W3
+ vmovdqa32 [DPTR1+_DATA+(4)*64],W4
+ vmovdqa32 [DPTR1+_DATA+(5)*64],W5
+ vmovdqa32 [DPTR1+_DATA+(6)*64],W6
+ vmovdqa32 [DPTR1+_DATA+(7)*64],W7
+ vmovdqa32 [DPTR1+_DATA+(8)*64],W8
+ vmovdqa32 [DPTR1+_DATA+(9)*64],W9
+ vmovdqa32 [DPTR1+_DATA+(10)*64],W10
+ vmovdqa32 [DPTR1+_DATA+(11)*64],W11
+ vmovdqa32 [DPTR1+_DATA+(12)*64],W12
+ vmovdqa32 [DPTR1+_DATA+(13)*64],W13
+ vmovdqa32 [DPTR1+_DATA+(14)*64],W14
+ vmovdqa32 [DPTR1+_DATA+(15)*64],W15
+
+ ;; second 16 lanes read
+ mov inp0, [IN + 16*8]
+ mov inp1, [IN + 17*8]
+ mov inp2, [IN + 18*8]
+ mov inp3, [IN + 19*8]
+ mov inp4, [IN + 20*8]
+ mov inp5, [IN + 21*8]
+ mov inp6, [IN + 22*8]
+ mov inp7, [IN + 23*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 24*8]
+ mov inp1, [IN + 25*8]
+ mov inp2, [IN + 26*8]
+ mov inp3, [IN + 27*8]
+ mov inp4, [IN + 28*8]
+ mov inp5, [IN + 29*8]
+ mov inp6, [IN + 30*8]
+ mov inp7, [IN + 31*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+ ;; second 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR1+_DATA+(16+0)*64],W0
+ vmovdqa32 [DPTR1+_DATA+(16+1)*64],W1
+ vmovdqa32 [DPTR1+_DATA+(16+2)*64],W2
+ vmovdqa32 [DPTR1+_DATA+(16+3)*64],W3
+ vmovdqa32 [DPTR1+_DATA+(16+4)*64],W4
+ vmovdqa32 [DPTR1+_DATA+(16+5)*64],W5
+ vmovdqa32 [DPTR1+_DATA+(16+6)*64],W6
+ vmovdqa32 [DPTR1+_DATA+(16+7)*64],W7
+ vmovdqa32 [DPTR1+_DATA+(16+8)*64],W8
+ vmovdqa32 [DPTR1+_DATA+(16+9)*64],W9
+ vmovdqa32 [DPTR1+_DATA+(16+10)*64],W10
+ vmovdqa32 [DPTR1+_DATA+(16+11)*64],W11
+ vmovdqa32 [DPTR1+_DATA+(16+12)*64],W12
+ vmovdqa32 [DPTR1+_DATA+(16+13)*64],W13
+ vmovdqa32 [DPTR1+_DATA+(16+14)*64],W14
+ vmovdqa32 [DPTR1+_DATA+(16+15)*64],W15
+
+ ;; Initialize digests
+ ;; use vmovdqu32 in place of vmovdqa32 (the digest rows are not guaranteed to be 64-byte aligned)
+ vmovdqu32 A, [DIGEST + 0 * MD5_DIGEST_ROW_SIZE]
+ vmovdqu32 B, [DIGEST + 1 * MD5_DIGEST_ROW_SIZE]
+ vmovdqu32 C, [DIGEST + 2 * MD5_DIGEST_ROW_SIZE]
+ vmovdqu32 D, [DIGEST + 3 * MD5_DIGEST_ROW_SIZE]
+ ; Load the digest for each stream in the second set of 16 lanes (17-32)
+ vmovdqu32 A1,[DIGEST + 0 * MD5_DIGEST_ROW_SIZE + 64]
+ vmovdqu32 B1,[DIGEST + 1 * MD5_DIGEST_ROW_SIZE + 64]
+ vmovdqu32 C1,[DIGEST + 2 * MD5_DIGEST_ROW_SIZE + 64]
+ vmovdqu32 D1,[DIGEST + 3 * MD5_DIGEST_ROW_SIZE + 64]
+
+.lloop:
+ ;; Increment IDX to point to next data block (64 bytes per block)
+ add IDX, 64
+
+ ; Save digests for later addition
+ vmovdqa32 [Z_AA], A
+ vmovdqa32 [Z_BB], B
+ vmovdqa32 [Z_CC], C
+ vmovdqa32 [Z_DD], D
+ vmovdqa32 [Z_AA1], A1
+ vmovdqa32 [Z_BB1], B1
+ vmovdqa32 [Z_CC1], C1
+ vmovdqa32 [Z_DD1], D1
+
+ sub SIZE, 1
+ je .LastLoop
+
+%assign I 0
+%assign I_fimm 0xCA
+%rep 16 ; 0<=I<=15
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data I
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+ ;; first 16 lanes read
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+
+%assign I 16
+%assign I_fimm 0xE4
+%rep 16 ; 16<=I<=31
+ %assign I_data ((5*I+1) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ;; first 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR2+_DATA+(0)*64],W0
+ vmovdqa32 [DPTR2+_DATA+(1)*64],W1
+ vmovdqa32 [DPTR2+_DATA+(2)*64],W2
+ vmovdqa32 [DPTR2+_DATA+(3)*64],W3
+ vmovdqa32 [DPTR2+_DATA+(4)*64],W4
+ vmovdqa32 [DPTR2+_DATA+(5)*64],W5
+ vmovdqa32 [DPTR2+_DATA+(6)*64],W6
+ vmovdqa32 [DPTR2+_DATA+(7)*64],W7
+ vmovdqa32 [DPTR2+_DATA+(8)*64],W8
+ vmovdqa32 [DPTR2+_DATA+(9)*64],W9
+ vmovdqa32 [DPTR2+_DATA+(10)*64],W10
+ vmovdqa32 [DPTR2+_DATA+(11)*64],W11
+ vmovdqa32 [DPTR2+_DATA+(12)*64],W12
+ vmovdqa32 [DPTR2+_DATA+(13)*64],W13
+ vmovdqa32 [DPTR2+_DATA+(14)*64],W14
+ vmovdqa32 [DPTR2+_DATA+(15)*64],W15
+
+%assign I 32
+%assign I_fimm 0x96
+%rep 16 ; 32<=I<=47
+ %assign I_data ((3*I+5) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ;; second 16 lanes read
+ mov inp0, [IN + 16*8]
+ mov inp1, [IN + 17*8]
+ mov inp2, [IN + 18*8]
+ mov inp3, [IN + 19*8]
+ mov inp4, [IN + 20*8]
+ mov inp5, [IN + 21*8]
+ mov inp6, [IN + 22*8]
+ mov inp7, [IN + 23*8]
+ vmovdqu32 W0,[inp0+IDX]
+ vmovdqu32 W1,[inp1+IDX]
+ vmovdqu32 W2,[inp2+IDX]
+ vmovdqu32 W3,[inp3+IDX]
+ vmovdqu32 W4,[inp4+IDX]
+ vmovdqu32 W5,[inp5+IDX]
+ vmovdqu32 W6,[inp6+IDX]
+ vmovdqu32 W7,[inp7+IDX]
+ mov inp0, [IN + 24*8]
+ mov inp1, [IN + 25*8]
+ mov inp2, [IN + 26*8]
+ mov inp3, [IN + 27*8]
+ mov inp4, [IN + 28*8]
+ mov inp5, [IN + 29*8]
+ mov inp6, [IN + 30*8]
+ mov inp7, [IN + 31*8]
+ vmovdqu32 W8, [inp0+IDX]
+ vmovdqu32 W9, [inp1+IDX]
+ vmovdqu32 W10,[inp2+IDX]
+ vmovdqu32 W11,[inp3+IDX]
+ vmovdqu32 W12,[inp4+IDX]
+ vmovdqu32 W13,[inp5+IDX]
+ vmovdqu32 W14,[inp6+IDX]
+ vmovdqu32 W15,[inp7+IDX]
+
+%assign I 48
+%assign I_fimm 0x39
+%rep 16 ; 48<=I<=63
+ %assign I_rotX (I/16+1)
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data ((7*I) % 16)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ;; second 16 lanes trans&write
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+ vmovdqa32 [DPTR2+_DATA+(16+0)*64],W0
+ vmovdqa32 [DPTR2+_DATA+(16+1)*64],W1
+ vmovdqa32 [DPTR2+_DATA+(16+2)*64],W2
+ vmovdqa32 [DPTR2+_DATA+(16+3)*64],W3
+ vmovdqa32 [DPTR2+_DATA+(16+4)*64],W4
+ vmovdqa32 [DPTR2+_DATA+(16+5)*64],W5
+ vmovdqa32 [DPTR2+_DATA+(16+6)*64],W6
+ vmovdqa32 [DPTR2+_DATA+(16+7)*64],W7
+ vmovdqa32 [DPTR2+_DATA+(16+8)*64],W8
+ vmovdqa32 [DPTR2+_DATA+(16+9)*64],W9
+ vmovdqa32 [DPTR2+_DATA+(16+10)*64],W10
+ vmovdqa32 [DPTR2+_DATA+(16+11)*64],W11
+ vmovdqa32 [DPTR2+_DATA+(16+12)*64],W12
+ vmovdqa32 [DPTR2+_DATA+(16+13)*64],W13
+ vmovdqa32 [DPTR2+_DATA+(16+14)*64],W14
+ vmovdqa32 [DPTR2+_DATA+(16+15)*64],W15
+
+ ; Add old digest
+ vpaddd A,A,[Z_AA]
+ vpaddd B,B,[Z_BB]
+ vpaddd C,C,[Z_CC]
+ vpaddd D,D,[Z_DD]
+ vpaddd A1,A1,[Z_AA1]
+ vpaddd B1,B1,[Z_BB1]
+ vpaddd C1,C1,[Z_CC1]
+ vpaddd D1,D1,[Z_DD1]
+
+ ; Swap DPTR1 and DPTR2
+ xchg DPTR1, DPTR2
+ ;; Proceed to processing of next block
+ jmp .lloop
+
+.LastLoop:
+%assign I 0
+%assign I_fimm 0xCA
+%rep 16 ; 0<=I<=15
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data I
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+%assign I 16
+%assign I_fimm 0xE4
+%rep 16 ; 16<=I<=31
+ %assign I_data ((5*I+1) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+%assign I 32
+%assign I_fimm 0x96
+%rep 16 ; 32<=I<=47
+ %assign I_data ((3*I+5) % 16)
+ %assign I_rotX I/16+1
+ %assign I_rotY (I % 4 + 1)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+%assign I 48
+%assign I_fimm 0x39
+%rep 16 ; 48<=I<=63
+ %assign I_rotX (I/16+1)
+ %assign I_rotY (I % 4 + 1)
+ %assign I_data ((7*I) % 16)
+ vpbroadcastd md5c, [TBL + I * 4]
+ PROCESS_LOOP md5c, DPTR1+ I_data*64, I_fimm, APPEND3(rot, I_rotX, I_rotY), TMP0, TMP1
+ %assign I (I+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,[Z_AA]
+ vpaddd B,B,[Z_BB]
+ vpaddd C,C,[Z_CC]
+ vpaddd D,D,[Z_DD]
+ vpaddd A1,A1,[Z_AA1]
+ vpaddd B1,B1,[Z_BB1]
+ vpaddd C1,C1,[Z_CC1]
+ vpaddd D1,D1,[Z_DD1]
+
+ ;; update into data pointers
+%assign I 0
+%rep 16
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ vmovdqu32 [DIGEST + 0*MD5_DIGEST_ROW_SIZE ], A
+ vmovdqu32 [DIGEST + 1*MD5_DIGEST_ROW_SIZE ], B
+ vmovdqu32 [DIGEST + 2*MD5_DIGEST_ROW_SIZE ], C
+ vmovdqu32 [DIGEST + 3*MD5_DIGEST_ROW_SIZE ], D
+ ; Store the digest for each stream in the second set of 16 lanes (17-32)
+ vmovdqu32 [DIGEST + 0 * MD5_DIGEST_ROW_SIZE + 64], A1
+ vmovdqu32 [DIGEST + 1 * MD5_DIGEST_ROW_SIZE + 64], B1
+ vmovdqu32 [DIGEST + 2 * MD5_DIGEST_ROW_SIZE + 64], C1
+ vmovdqu32 [DIGEST + 3 * MD5_DIGEST_ROW_SIZE + 64], D1
+
+ mov rsp, [rsp + _RSP_SAVE]
+ ret
+
+section .data
+align 64
+MD5_TABLE:
+ dd 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee
+ dd 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501
+ dd 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be
+ dd 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821
+ dd 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa
+ dd 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8
+ dd 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed
+ dd 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a
+ dd 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c
+ dd 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70
+ dd 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05
+ dd 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665
+ dd 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039
+ dd 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1
+ dd 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1
+ dd 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
+
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
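+
+;; vpermi2q index tables used by TRANSPOSE16: entries 0-7 select a qword from
+;; the first source operand, 8-15 from the second.  The first table gathers
+;; qword pairs {0,1} and {4,5} of both sources, the second pairs {2,3} and
+;; {6,7}.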
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_md5_mb_x16x2_avx512
+no_md5_mb_x16x2_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm
new file mode 100644
index 000000000..1b4927909
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_avx.asm
@@ -0,0 +1,782 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+default rel
+
+; clobbers all XMM registers
+; clobbers all GPRs except arg1 and r8
+
+;; code to compute octal MD5 using AVX
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ ;movdqa %%F,%%Z
+ vpxor %%F,%%Z, %%Y
+ vpand %%F,%%F,%%X
+ vpxor %%F,%%F,%%Z
+%endmacro
+
+; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y))
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ MAGIC_F %%F,%%Z,%%X,%%Y
+%endmacro
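+
+; This relies on the identity G(X,Y,Z) = (X&Z)|(Y&(~Z)) = F(Z,X,Y), so
+; MAGIC_G can reuse MAGIC_F with its arguments rotated.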
+
+; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ ;movdqa %%F,%%Z
+ vpxor %%F,%%Z, %%Y
+ vpxor %%F,%%F, %%X
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ ;movdqa %%F,%%Z
+ vpxor %%F,%%Z,[ONES] ; pnot %%F
+ vpor %%F,%%F,%%X
+ vpxor %%F,%%F,%%Y
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ ;movdqa %%tmp, %%reg
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
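+
+; AVX has no packed rotate instruction (vprold is AVX-512), so PROLD builds a
+; 32-bit rotate-left out of a right shift, a left shift and an OR.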
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, data, MD5const, nrot
+%macro MD5_STEP1 14
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%data %12
+%define %%MD5const %13
+%define %%nrot %14
+
+ vpaddd %%A, %%A, %%MD5const
+ vpaddd %%A2, %%A2, %%MD5const
+ vpaddd %%A, %%A, [%%data]
+ vpaddd %%A2, %%A2, [%%data + 16*16]
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ vpaddd %%A, %%A, %%FUN
+ %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2
+ vpaddd %%A2, %%A2, %%FUN
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP
+ vpaddd %%A, %%A, %%B
+ vpaddd %%A2, %%A2, %%B2
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C3,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+ vmovdqa %%TMP,[%%data]
+ vmovdqa %%TMP2,[%%data + 16*16]
+ vpaddd %%A, %%A, %%MD5const
+ vpaddd %%A2, %%A2, %%MD5const
+ vpaddd %%A, %%A, %%TMP
+ vpaddd %%A2, %%A2, %%TMP2
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2
+ vpaddd %%A, %%A, %%FUN
+ vpaddd %%A2, %%A2, %%FUN2
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP2
+ vpaddd %%A, %%A, %%B
+ vpaddd %%A2, %%A2, %%B2
+%endmacro
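+
+; MD5_STEP differs from MD5_STEP1 in loading the message words into TMP/TMP2
+; and keeping a separate FUN2/TMP2 pair for the second set of lanes; it is
+; used only for the last block, where no further input needs transposing and
+; T0/T1 (which alias FUN2/TMP2) are free.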
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4 ; tmp
+%define F xmm5 ; tmp
+
+%define A2 xmm6
+%define B2 xmm7
+%define C2 xmm8
+%define D2 xmm9
+
+
+%define FUN E
+%define TMP F
+%define FUN2 xmm10
+%define TMP2 xmm11
+
+%define T0 xmm10
+%define T1 xmm11
+%define T2 xmm12
+%define T3 xmm13
+%define T4 xmm14
+%define T5 xmm15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;; Linux Registers
+%define arg1 rdi
+%define arg2 rsi
+%define inp7 rcx
+%define mem1 rdx
+%else
+;; Windows Registers
+%define arg1 rcx
+%define arg2 rdx
+%define inp7 rdi
+%define mem1 rsi
+%endif
+; r8 is not used
+
+; Common definitions
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define TBL rax
+%define IDX rbx
+%define mem2 rbp
+
+
+
+
+
+; Stack Layout
+;
+; 470 DD2
+; 460 CC2
+; 450 BB2
+; 440 AA2
+; 430 DD
+; 420 CC
+; 410 BB
+; 400 AA
+;
+; 3F0 data2[15] for lanes 7...4 \
+; ... \
+; 300 data2[0] for lanes 7...4 \
+; 2F0 data2[15] for lanes 3...0 > mem block 2
+; ... /
+; 210 data2[1] for lanes 3...0 /
+; 200 data2[0] for lanes 3...0 /
+;
+; 1F0 data1[15] for lanes 7...4 \
+; ... \
+; 100 data1[0] for lanes 7...4 \
+; F0 data1[15] for lanes 3...0 > mem block 1
+; ... /
+; 10 data1[1] for lanes 3...0 /
+; 0 data1[0] for lanes 3...0 /
+
+MEM equ 16*16*2*2 ; two blocks of data stored in stack
+; STACK_SIZE must be an odd multiple of 8 bytes in size
+STACK_SIZE equ MEM + 16*8 + 8
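+; (the call pushed 8 bytes of return address, so subtracting an odd multiple
+; of 8 keeps rsp 16-byte aligned for the vmovdqa saves at AA..DD2)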
+
+%define AA rsp + MEM + 16*0
+%define BB rsp + MEM + 16*1
+%define CC rsp + MEM + 16*2
+%define DD rsp + MEM + 16*3
+%define AA2 rsp + MEM + 16*4
+%define BB2 rsp + MEM + 16*5
+%define CC2 rsp + MEM + 16*6
+%define DD2 rsp + MEM + 16*7
+
+;;%define DIGEST_SIZE (8*4*4) ; 8 streams x 4 32bit words per digest x 4 bytes per word
+
+;#define NUM_MD5_DIGEST_WORDS 4
+;#define NUM_LANES 8
+;#define MD5_BLOCK_SIZE 64
+;
+;typedef UINT32 digest_array[NUM_MD5_DIGEST_WORDS][NUM_LANES];
+;
+;typedef struct {
+; DECLARE_ALIGNED(digest_array digest, 16);
+; UINT8* data_ptr[NUM_LANES];
+;} MD5_ARGS_X8;
+
+; void md5_mb_x4x2_avx(MD5_ARGS_X8 *args, UINT64 size)
+; arg 1 : pointer to MD5_ARGS_X8 structure
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+;
+; arg1 and r8 are maintained by this function
+;
+align 32
+global md5_mb_x4x2_avx:function internal
+md5_mb_x4x2_avx:
+ sub rsp, STACK_SIZE
+
+ ;; Initialize digests
+ vmovdqu A,[arg1+0*16]
+ vmovdqu B,[arg1+2*16]
+ vmovdqu C,[arg1+4*16]
+ vmovdqu D,[arg1+6*16]
+
+ vmovdqu A2,[arg1+1*16]
+ vmovdqu B2,[arg1+3*16]
+ vmovdqu C2,[arg1+5*16]
+ vmovdqu D2,[arg1+7*16]
+
+ lea TBL, [MD5_TABLE]
+
+ ;; load input pointers
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+ mov inp4,[arg1 + _data_ptr + 4*8]
+ mov inp5,[arg1 + _data_ptr + 5*8]
+ mov inp6,[arg1 + _data_ptr + 6*8]
+ mov inp7,[arg1 + _data_ptr + 7*8]
+
+ xor IDX, IDX
+
+ ; Make ping-pong pointers to the two memory blocks
+ mov mem1, rsp
+ lea mem2, [rsp + 16*16*2]
+
+
+;; Load first block of data and save back to stack
+%assign I 0
+%rep 4
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem1+(I*4+0)*16],T0
+ vmovdqa [mem1+(I*4+1)*16],T1
+ vmovdqa [mem1+(I*4+2)*16],T2
+ vmovdqa [mem1+(I*4+3)*16],T3
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem1+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem1+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem1+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem1+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+%endrep
+
+lloop:
+
+ ; save old digests
+ vmovdqa [AA], A
+ vmovdqa [BB], B
+ vmovdqa [CC], C
+ vmovdqa [DD], D
+ ; save old digests
+ vmovdqa [AA2], A2
+ vmovdqa [BB2], B2
+ vmovdqa [CC2], C2
+ vmovdqa [DD2], D2
+
+ add IDX, 4*16
+ sub arg2, 1
+ je lastblock
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 0*16, [TBL+ 0*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 1*16, [TBL+ 1*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 2*16, [TBL+ 2*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 3*16, [TBL+ 3*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 4*16, [TBL+ 4*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 5*16, [TBL+ 5*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 6*16, [TBL+ 6*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 7*16, [TBL+ 7*16], rot14
+
+%assign I 0
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 8*16, [TBL+ 8*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 9*16, [TBL+ 9*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+10*16, [TBL+10*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+11*16, [TBL+11*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+12*16, [TBL+12*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+13*16, [TBL+13*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+14*16, [TBL+14*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+15*16, [TBL+15*16], rot14
+
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 1*16, [TBL+16*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 6*16, [TBL+17*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+11*16, [TBL+18*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 0*16, [TBL+19*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 5*16, [TBL+20*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+10*16, [TBL+21*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+15*16, [TBL+22*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 4*16, [TBL+23*16], rot24
+
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 9*16, [TBL+24*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+14*16, [TBL+25*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 3*16, [TBL+26*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 8*16, [TBL+27*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+13*16, [TBL+28*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 2*16, [TBL+29*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 7*16, [TBL+30*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+12*16, [TBL+31*16], rot24
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 5*16, [TBL+32*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 8*16, [TBL+33*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+11*16, [TBL+34*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+14*16, [TBL+35*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 1*16, [TBL+36*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 4*16, [TBL+37*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 7*16, [TBL+38*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+10*16, [TBL+39*16], rot34
+
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+13*16, [TBL+40*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 0*16, [TBL+41*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 3*16, [TBL+42*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 6*16, [TBL+43*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 9*16, [TBL+44*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+12*16, [TBL+45*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+15*16, [TBL+46*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 2*16, [TBL+47*16], rot34
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 0*16, [TBL+48*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 7*16, [TBL+49*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+14*16, [TBL+50*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 5*16, [TBL+51*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+12*16, [TBL+52*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+ 3*16, [TBL+53*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+10*16, [TBL+54*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 1*16, [TBL+55*16], rot44
+
+ vmovdqu T2,[inp0+IDX+I*16]
+ vmovdqu T1,[inp1+IDX+I*16]
+ vmovdqu T4,[inp2+IDX+I*16]
+ vmovdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16],T0
+ vmovdqa [mem2+(I*4+1)*16],T1
+ vmovdqa [mem2+(I*4+2)*16],T2
+ vmovdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 8*16, [TBL+56*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+15*16, [TBL+57*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 6*16, [TBL+58*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+13*16, [TBL+59*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1+ 4*16, [TBL+60*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1+11*16, [TBL+61*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1+ 2*16, [TBL+62*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1+ 9*16, [TBL+63*16], rot44
+
+ vmovdqu T2,[inp4+IDX+I*16]
+ vmovdqu T1,[inp5+IDX+I*16]
+ vmovdqu T4,[inp6+IDX+I*16]
+ vmovdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vmovdqa [mem2+(I*4+0)*16 + 16*16],T0
+ vmovdqa [mem2+(I*4+1)*16 + 16*16],T1
+ vmovdqa [mem2+(I*4+2)*16 + 16*16],T2
+ vmovdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+
+ vpaddd A,A,[AA]
+ vpaddd B,B,[BB]
+ vpaddd C,C,[CC]
+ vpaddd D,D,[DD]
+
+ vpaddd A2,A2,[AA2]
+ vpaddd B2,B2,[BB2]
+ vpaddd C2,C2,[CC2]
+ vpaddd D2,D2,[DD2]
+
+ ; swap mem1 and mem2
+ xchg mem1, mem2
+
+ jmp lloop
+
+lastblock:
+
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+ 0*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+ 1*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+ 2*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+ 3*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+ 4*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+ 5*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+ 6*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+ 7*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+ 8*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+ 9*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+10*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+11*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+12*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+13*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+14*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+15*16], rot14
+
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+16*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+17*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+18*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+19*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+20*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+21*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+22*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+23*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+24*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+25*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+26*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+27*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+28*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+29*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+30*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+31*16], rot24
+
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+32*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+33*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+34*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+35*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+36*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+37*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+38*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+39*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+40*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+41*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+42*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+43*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+44*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+45*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+46*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+47*16], rot34
+
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 0*16, [TBL+48*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 7*16, [TBL+49*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+14*16, [TBL+50*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 5*16, [TBL+51*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+12*16, [TBL+52*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+ 3*16, [TBL+53*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+10*16, [TBL+54*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 1*16, [TBL+55*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 8*16, [TBL+56*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+15*16, [TBL+57*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 6*16, [TBL+58*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+13*16, [TBL+59*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1+ 4*16, [TBL+60*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1+11*16, [TBL+61*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1+ 2*16, [TBL+62*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1+ 9*16, [TBL+63*16], rot44
+
+ vpaddd A,A,[AA]
+ vpaddd B,B,[BB]
+ vpaddd C,C,[CC]
+ vpaddd D,D,[DD]
+
+ vpaddd A2,A2,[AA2]
+ vpaddd B2,B2,[BB2]
+ vpaddd C2,C2,[CC2]
+ vpaddd D2,D2,[DD2]
+
+ ; write out digests
+ vmovdqu [arg1+0*16], A
+ vmovdqu [arg1+2*16], B
+ vmovdqu [arg1+4*16], C
+ vmovdqu [arg1+6*16], D
+
+ vmovdqu [arg1+1*16], A2
+ vmovdqu [arg1+3*16], B2
+ vmovdqu [arg1+5*16], C2
+ vmovdqu [arg1+7*16], D2
+
+ ;; update input pointers
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ mov [arg1 + _data_ptr + 1*8], inp1
+ mov [arg1 + _data_ptr + 2*8], inp2
+ mov [arg1 + _data_ptr + 3*8], inp3
+ mov [arg1 + _data_ptr + 4*8], inp4
+ mov [arg1 + _data_ptr + 5*8], inp5
+ mov [arg1 + _data_ptr + 6*8], inp6
+ mov [arg1 + _data_ptr + 7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ add rsp, STACK_SIZE
+
+ ret
+
+section .data align=64
+
+align 64
+MD5_TABLE:
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
+
+ONES:
+ dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm
new file mode 100644
index 000000000..f3fc29eca
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x4x2_sse.asm
@@ -0,0 +1,778 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+default rel
+
+; clobbers all XMM registers
+; clobbers all GPRs except arg1 and r8
+
+;; code to compute 8-lane (octal) MD5 using SSE
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ movdqa %%t0, %%r0
+ shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ movdqa %%t1, %%r2
+ shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ movdqa %%r1, %%t0
+ shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ movdqa %%r3, %%r0
+ shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
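+
+; For reference, the net effect of TRANSPOSE in scalar terms is a plain 4x4
+; dword transpose (illustrative C sketch only, not part of the build):
+;
+;   for (i = 0; i < 4; i++)
+;       for (j = 0; j < 4; j++)
+;           out[i][j] = in[j][i];   /* out rows land in t0, r1, r0, r3 */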
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ movdqa %%F,%%Z
+ pxor %%F,%%Y
+ pand %%F,%%X
+ pxor %%F,%%Z
+%endmacro
+
+; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y))
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ MAGIC_F %%F,%%Z,%%X,%%Y
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ movdqa %%F,%%Z
+ pxor %%F,%%Y
+ pxor %%F,%%X
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ movdqa %%F,%%Z
+ pxor %%F,[ONES] ; pnot %%F
+ por %%F,%%X
+ pxor %%F,%%Y
+%endmacro
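+
+; Note (per RFC 1321): F(X,Y,Z) = (X & Y) | (~X & Z) and
+; G(X,Y,Z) = (X & Z) | (Y & ~Z). The form Z ^ (X & (Y ^ Z)) used above is an
+; equivalent of F that needs only one temp register, and since
+; G(X,Y,Z) == F(Z,X,Y), MAGIC_G simply invokes MAGIC_F with rotated arguments.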
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psrld %%tmp, (32-%%imm)
+ pslld %%reg, %%imm
+ por %%reg, %%tmp
+%endmacro
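+
+; PROLD is a 32-bit rotate-left on each dword lane:
+;   reg = (reg << imm) | (reg >> (32 - imm))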
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP1 MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, data, MD5const, nrot
+%macro MD5_STEP1 14
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%data %12
+%define %%MD5const %13
+%define %%nrot %14
+
+ paddd %%A, %%MD5const
+ paddd %%A2, %%MD5const
+ paddd %%A, [%%data]
+ paddd %%A2, [%%data + 16*16]
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ paddd %%A, %%FUN
+ %%MAGIC_FUN %%FUN, %%B2,%%C2,%%D2
+ paddd %%A2, %%FUN
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP
+ paddd %%A, %%B
+ paddd %%A2, %%B2
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%A %2
+%define %%B %3
+%define %%C %4
+%define %%D %5
+%define %%A2 %6
+%define %%B2 %7
+%define %%C2 %8
+%define %%D2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+ paddd %%A, %%MD5const
+ paddd %%A2, %%MD5const
+ paddd %%A, [%%data]
+ paddd %%A2, [%%data + 16*16]
+ %%MAGIC_FUN %%FUN, %%B,%%C,%%D
+ %%MAGIC_FUN %%FUN2, %%B2,%%C2,%%D2
+ paddd %%A, %%FUN
+ paddd %%A2, %%FUN2
+ PROLD %%A,%%nrot, %%TMP
+ PROLD %%A2,%%nrot, %%TMP2
+ paddd %%A, %%B
+ paddd %%A2, %%B2
+%endmacro
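+
+; In scalar terms each step above computes, per lane (illustrative C sketch):
+;
+;   a = b + rotl32(a + magic(b, c, d) + m[k] + t[i], nrot);
+;
+; where m[] are the transposed message words and t[] the MD5_TABLE constants.
+; MD5_STEP1 and MD5_STEP perform the same update on both sets of four lanes;
+; they differ only in that MD5_STEP1 reuses FUN/TMP serially for the second
+; set, while MD5_STEP uses the separate FUN2/TMP2 pair (FUN2/TMP2 alias
+; T0/T1, which are busy with data transposes inside the main loop).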
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4 ; tmp
+%define F xmm5 ; tmp
+
+%define A2 xmm6
+%define B2 xmm7
+%define C2 xmm8
+%define D2 xmm9
+
+
+%define FUN E
+%define TMP F
+%define FUN2 xmm10
+%define TMP2 xmm11
+
+%define T0 xmm10
+%define T1 xmm11
+%define T2 xmm12
+%define T3 xmm13
+%define T4 xmm14
+%define T5 xmm15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;; Linux Registers
+%define arg1 rdi
+%define arg2 rsi
+%define inp7 rcx
+%define mem1 rdx
+%else
+;; Windows Registers
+%define arg1 rcx
+%define arg2 rdx
+%define inp7 rdi
+%define mem1 rsi
+%endif
+; r8 is not used
+
+; Common definitions
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+
+%define TBL rax
+%define IDX rbx
+%define mem2 rbp
+
+
+; Stack Layout
+;
+; 470 DD2
+; 460 CC2
+; 450 BB2
+; 440 AA2
+; 430 DD
+; 420 CC
+; 410 BB
+; 400 AA
+;
+; 3F0 data2[15] for lanes 7...4 \
+; ... \
+; 300 data2[0] for lanes 7...4 \
+; 2F0 data2[15] for lanes 3...0 > mem block 2
+; ... /
+; 210 data2[1] for lanes 3...0 /
+; 200 data2[0] for lanes 3...0 /
+;
+; 1F0 data1[15] for lanes 7...4 \
+; ... \
+; 100 data1[0] for lanes 7...4 \
+; F0 data1[15] for lanes 3...0 > mem block 1
+; ... /
+; 10 data1[1] for lanes 3...0 /
+; 0 data1[0] for lanes 3...0 /
+
+MEM equ 16*16*2*2 ; two blocks of data stored in stack
+; STACK_SIZE must be an odd multiple of 8 bytes in size
+STACK_SIZE equ MEM + 16*8 + 8
+
+%define AA rsp + MEM + 16*0
+%define BB rsp + MEM + 16*1
+%define CC rsp + MEM + 16*2
+%define DD rsp + MEM + 16*3
+%define AA2 rsp + MEM + 16*4
+%define BB2 rsp + MEM + 16*5
+%define CC2 rsp + MEM + 16*6
+%define DD2 rsp + MEM + 16*7
+
+;;%define DIGEST_SIZE (8*4*4) ; 8 streams x 4 32bit words per digest x 4 bytes per word
+
+;#define NUM_MD5_DIGEST_WORDS 4
+;#define NUM_LANES 8
+;#define MD5_BLOCK_SIZE 64
+;
+;typedef UINT32 digest_array[NUM_MD5_DIGEST_WORDS][NUM_LANES];
+;
+;typedef struct {
+; DECLARE_ALIGNED(digest_array digest, 16);
+; UINT8* data_ptr[NUM_LANES];
+;} MD5_ARGS_X8;
+
+; void md5_mb_x4x2_sse(MD5_ARGS_X8 *args, UINT64 size)
+; arg 1 : pointer to MD5_ARGS_X8 structure
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+;
+; arg1 and r8 are maintained by this function
+;
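+; A hypothetical caller (illustrative only; names below are not from this
+; file) fills one data pointer per lane and the lane-interleaved digest words,
+; then passes a whole number of 64-byte blocks:
+;
+;   MD5_ARGS_X8 args;
+;   for (lane = 0; lane < 8; lane++)
+;       args.data_ptr[lane] = buf[lane];      /* each buffer holds n blocks */
+;   /* args.digest[w][lane] = word w of lane's running MD5 state */
+;   md5_mb_x4x2_sse(&args, n);                /* n >= 1 */
+;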
+align 32
+global md5_mb_x4x2_sse:function internal
+md5_mb_x4x2_sse:
+ sub rsp, STACK_SIZE
+
+	;; Initialize digests for lanes 0 to 3
+ movdqu A,[arg1+0*16]
+ movdqu B,[arg1+2*16]
+ movdqu C,[arg1+4*16]
+ movdqu D,[arg1+6*16]
+
+	;; Initialize digests for lanes 4 to 7
+ movdqu A2,[arg1+1*16]
+ movdqu B2,[arg1+3*16]
+ movdqu C2,[arg1+5*16]
+ movdqu D2,[arg1+7*16]
+
+ lea TBL, [MD5_TABLE]
+
+ ;; load input pointers
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+ mov inp4,[arg1 + _data_ptr + 4*8]
+ mov inp5,[arg1 + _data_ptr + 5*8]
+ mov inp6,[arg1 + _data_ptr + 6*8]
+ mov inp7,[arg1 + _data_ptr + 7*8]
+ xor IDX, IDX
+
+ ; Make ping-pong pointers to the two memory blocks
+ mov mem1, rsp
+ lea mem2, [rsp + 16*16*2]
+
+
+;; Load first block of data and save back to stack
+%assign I 0
+%rep 4
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem1+(I*4+0)*16],T0
+ movdqa [mem1+(I*4+1)*16],T1
+ movdqa [mem1+(I*4+2)*16],T2
+ movdqa [mem1+(I*4+3)*16],T3
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem1+(I*4+0)*16 + 16*16],T0
+ movdqa [mem1+(I*4+1)*16 + 16*16],T1
+ movdqa [mem1+(I*4+2)*16 + 16*16],T2
+ movdqa [mem1+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+%endrep
+
+lloop:
+ ; save old digests
+ movdqa [AA], A
+ movdqa [BB], B
+ movdqa [CC], C
+ movdqa [DD], D
+ ; save old digests
+ movdqa [AA2], A2
+ movdqa [BB2], B2
+ movdqa [CC2], C2
+ movdqa [DD2], D2
+
+ add IDX, 4*16
+ sub arg2, 1
+ je lastblock
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+ 0*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 1*16, [TBL+ 1*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+ 2*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 3*16, [TBL+ 3*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+ 4*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 5*16, [TBL+ 5*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+ 6*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 7*16, [TBL+ 7*16], rot14
+
+%assign I 0
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+ 8*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 9*16, [TBL+ 9*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+10*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +11*16, [TBL+11*16], rot14
+ MD5_STEP1 MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+12*16], rot11
+ MD5_STEP1 MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +13*16, [TBL+13*16], rot12
+ MD5_STEP1 MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+14*16], rot13
+ MD5_STEP1 MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +15*16, [TBL+15*16], rot14
+
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+16*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 6*16, [TBL+17*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+18*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 0*16, [TBL+19*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+20*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +10*16, [TBL+21*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+22*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 4*16, [TBL+23*16], rot24
+
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+24*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +14*16, [TBL+25*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+26*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 8*16, [TBL+27*16], rot24
+ MD5_STEP1 MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+28*16], rot21
+ MD5_STEP1 MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 2*16, [TBL+29*16], rot22
+ MD5_STEP1 MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+30*16], rot23
+ MD5_STEP1 MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +12*16, [TBL+31*16], rot24
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 5*16, [TBL+32*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 8*16, [TBL+33*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +11*16, [TBL+34*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +14*16, [TBL+35*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 1*16, [TBL+36*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 4*16, [TBL+37*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 7*16, [TBL+38*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +10*16, [TBL+39*16], rot34
+
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +13*16, [TBL+40*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 0*16, [TBL+41*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 3*16, [TBL+42*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 6*16, [TBL+43*16], rot34
+ MD5_STEP1 MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 9*16, [TBL+44*16], rot31
+ MD5_STEP1 MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +12*16, [TBL+45*16], rot32
+ MD5_STEP1 MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +15*16, [TBL+46*16], rot33
+ MD5_STEP1 MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 2*16, [TBL+47*16], rot34
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 0*16, [TBL+48*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 7*16, [TBL+49*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +14*16, [TBL+50*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 5*16, [TBL+51*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 +12*16, [TBL+52*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 + 3*16, [TBL+53*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 +10*16, [TBL+54*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 1*16, [TBL+55*16], rot44
+
+ movdqu T2,[inp0+IDX+I*16]
+ movdqu T1,[inp1+IDX+I*16]
+ movdqu T4,[inp2+IDX+I*16]
+ movdqu T3,[inp3+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16],T0
+ movdqa [mem2+(I*4+1)*16],T1
+ movdqa [mem2+(I*4+2)*16],T2
+ movdqa [mem2+(I*4+3)*16],T3
+
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 8*16, [TBL+56*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +15*16, [TBL+57*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 6*16, [TBL+58*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 +13*16, [TBL+59*16], rot44
+ MD5_STEP1 MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, mem1 + 4*16, [TBL+60*16], rot41
+ MD5_STEP1 MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, mem1 +11*16, [TBL+61*16], rot42
+ MD5_STEP1 MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, mem1 + 2*16, [TBL+62*16], rot43
+ MD5_STEP1 MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, mem1 + 9*16, [TBL+63*16], rot44
+
+ movdqu T2,[inp4+IDX+I*16]
+ movdqu T1,[inp5+IDX+I*16]
+ movdqu T4,[inp6+IDX+I*16]
+ movdqu T3,[inp7+IDX+I*16]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ movdqa [mem2+(I*4+0)*16 + 16*16],T0
+ movdqa [mem2+(I*4+1)*16 + 16*16],T1
+ movdqa [mem2+(I*4+2)*16 + 16*16],T2
+ movdqa [mem2+(I*4+3)*16 + 16*16],T3
+%assign I (I+1)
+
+
+ paddd A,[AA]
+ paddd B,[BB]
+ paddd C,[CC]
+ paddd D,[DD]
+
+ paddd A2,[AA2]
+ paddd B2,[BB2]
+ paddd C2,[CC2]
+ paddd D2,[DD2]
+
+ ; swap mem1 and mem2
+ xchg mem1, mem2
+
+ jmp lloop
+
+lastblock:
+
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+ 0*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+ 1*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+ 2*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+ 3*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+ 4*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+ 5*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+ 6*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+ 7*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+ 8*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+ 9*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+10*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+11*16], rot14
+ MD5_STEP MAGIC_F, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+12*16], rot11
+ MD5_STEP MAGIC_F, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+13*16], rot12
+ MD5_STEP MAGIC_F, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+14*16], rot13
+ MD5_STEP MAGIC_F, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+15*16], rot14
+
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+16*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+17*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+18*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+19*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+20*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+21*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+22*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+23*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+24*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+25*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+26*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+27*16], rot24
+ MD5_STEP MAGIC_G, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+28*16], rot21
+ MD5_STEP MAGIC_G, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+29*16], rot22
+ MD5_STEP MAGIC_G, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+30*16], rot23
+ MD5_STEP MAGIC_G, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+31*16], rot24
+
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+32*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+33*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+34*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+35*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+36*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+37*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+38*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+39*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+40*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+41*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+42*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+43*16], rot34
+ MD5_STEP MAGIC_H, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+44*16], rot31
+ MD5_STEP MAGIC_H, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+45*16], rot32
+ MD5_STEP MAGIC_H, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+46*16], rot33
+ MD5_STEP MAGIC_H, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+47*16], rot34
+
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 0*16, [TBL+48*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 7*16, [TBL+49*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +14*16, [TBL+50*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 5*16, [TBL+51*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 +12*16, [TBL+52*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 + 3*16, [TBL+53*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 +10*16, [TBL+54*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 1*16, [TBL+55*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 8*16, [TBL+56*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +15*16, [TBL+57*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 6*16, [TBL+58*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 +13*16, [TBL+59*16], rot44
+ MD5_STEP MAGIC_I, A,B,C,D, A2,B2,C2,D2, FUN,TMP, FUN2,TMP2, mem1 + 4*16, [TBL+60*16], rot41
+ MD5_STEP MAGIC_I, D,A,B,C, D2,A2,B2,C2, FUN,TMP, FUN2,TMP2, mem1 +11*16, [TBL+61*16], rot42
+ MD5_STEP MAGIC_I, C,D,A,B, C2,D2,A2,B2, FUN,TMP, FUN2,TMP2, mem1 + 2*16, [TBL+62*16], rot43
+ MD5_STEP MAGIC_I, B,C,D,A, B2,C2,D2,A2, FUN,TMP, FUN2,TMP2, mem1 + 9*16, [TBL+63*16], rot44
+
+ paddd A,[AA]
+ paddd B,[BB]
+ paddd C,[CC]
+ paddd D,[DD]
+
+ paddd A2,[AA2]
+ paddd B2,[BB2]
+ paddd C2,[CC2]
+ paddd D2,[DD2]
+
+ ; write out digests
+ movdqu [arg1+0*16], A
+ movdqu [arg1+2*16], B
+ movdqu [arg1+4*16], C
+ movdqu [arg1+6*16], D
+ movdqu [arg1+1*16], A2
+ movdqu [arg1+3*16], B2
+ movdqu [arg1+5*16], C2
+ movdqu [arg1+7*16], D2
+
+ ;; update input pointers
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ mov [arg1 + _data_ptr + 1*8], inp1
+ mov [arg1 + _data_ptr + 2*8], inp2
+ mov [arg1 + _data_ptr + 3*8], inp3
+ mov [arg1 + _data_ptr + 4*8], inp4
+ mov [arg1 + _data_ptr + 5*8], inp5
+ mov [arg1 + _data_ptr + 6*8], inp6
+ mov [arg1 + _data_ptr + 7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ add rsp, STACK_SIZE
+
+ ret
+
+section .data align=64
+
+align 64
+MD5_TABLE:
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
+
+ONES:
+ dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm
new file mode 100644
index 000000000..818c0ebbf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_mb_x8x2_avx2.asm
@@ -0,0 +1,917 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "md5_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+default rel
+
+;; code to compute 16-lane (double octal) MD5 using AVX2
+
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers: rax rbx rdx rsi rdi r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp
+;;
+;; clobbers ymm0-15
+
+;; clobbers all GPRs other than arg1 and rbp
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx
+ %define arg2 rdx
+ %define reg3 rdi
+ %define reg4 rsi
+%else
+ %define arg1 rdi
+ %define arg2 rsi
+ %define reg3 rcx
+ %define reg4 rdx
+%endif
+
+;; rbp is not clobbered
+
+%define state arg1
+%define num_blks arg2
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+%define inp4 r12
+%define inp5 r13
+%define inp6 r14
+%define inp7 r15
+
+;; These are pointers to data block1 and block2 on the stack,
+;; which will ping-pong back and forth (see the sketch below)
+%define DPTR1 rbx
+%define DPTR2 reg3
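+
+; Rough shape of the DPTR1/DPTR2 ping-pong (illustrative pseudo-C; helper
+; names are made up):
+;
+;   transpose_into(DPTR1, block 0);
+;   while (--num_blks) {
+;       md5_rounds(DPTR1);                 /* interleaved with ...         */
+;       transpose_into(DPTR2, next block); /* ... fetching the next block  */
+;       swap(DPTR1, DPTR2);
+;   }
+;   md5_rounds(DPTR1);                     /* lastblock: nothing left to fetch */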
+
+%define TBL rax
+%define IDX reg4
+
+;; Transposed Digest Storage
+%define Y_A ymm0
+%define Y_B ymm1
+%define Y_C ymm2
+%define Y_D ymm3
+%define Y_A2 ymm4
+%define Y_B2 ymm5
+%define Y_C2 ymm6
+%define Y_D2 ymm7
+
+;; Temp YMM registers corresponding to the Temp XMM registers
+;; used during the transposition of the digests
+%define Y_KTMP1 ymm12
+%define Y_KTMP2 ymm13
+;; Temporary registers used during MD5 round operations
+%define Y_FUN ymm8
+%define Y_TMP ymm9
+%define Y_FUN2 ymm10
+%define Y_TMP2 ymm11
+
+
+;; YMM registers used during data fetching.
+;; Data are stored into the stack after transposition
+%define Y_DAT0 ymm8
+%define Y_DAT1 ymm9
+%define Y_DAT2 ymm10
+%define Y_DAT3 ymm11
+%define Y_DAT4 ymm12
+%define Y_DAT5 ymm13
+%define Y_DAT6 ymm14
+%define Y_DAT7 ymm15
+
+;; Temporary registers used during data transposition
+%define Y_DTMP1 ymm0
+%define Y_DTMP2 ymm1
+
+
+%define RESY resb 32*
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESIZE mod 32 must be 32-8 = 24
+struc STACK
+_DATA: RESY 2*2*16 ; 2 blocks * 2 sets of lanes * 16 regs
+_DIGEST: RESY 8 ; stores Y_AA-Y_DD, Y_AA2-Y_DD2
+_TMPDIGEST: RESY 2 ; stores Y_AA, Y_BB temporarily
+_RSP_SAVE: RESQ 1 ; original RSP
+endstruc
+
+
+%define Y_AA rsp + _DIGEST + 32*0
+%define Y_BB rsp + _DIGEST + 32*1
+%define Y_CC rsp + _DIGEST + 32*2
+%define Y_DD rsp + _DIGEST + 32*3
+%define Y_AA2 rsp + _DIGEST + 32*4
+%define Y_BB2 rsp + _DIGEST + 32*5
+%define Y_CC2 rsp + _DIGEST + 32*6
+%define Y_DD2 rsp + _DIGEST + 32*7
+
+%define MD5_DIGEST_ROW_SIZE (16*4)
+
+;;
+;; MD5 left rotations (number of bits)
+;;
+rot11 equ 7
+rot12 equ 12
+rot13 equ 17
+rot14 equ 22
+rot21 equ 5
+rot22 equ 9
+rot23 equ 14
+rot24 equ 20
+rot31 equ 4
+rot32 equ 11
+rot33 equ 16
+rot34 equ 23
+rot41 equ 6
+rot42 equ 10
+rot43 equ 15
+rot44 equ 21
+
+; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+; "transpose" data in {r0...r7} using temps {t0...t1}
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+
+;
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+
+ ; use r2 in place of t0
+ ; process bottom half (r4..r7) {e...h}
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+
+ vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
+ vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
+ vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
+ vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
+ vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
+ vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
+ vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
+ vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
+%endmacro
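+
+; Net effect of TRANSPOSE8, in scalar terms (illustrative): an 8x8 dword
+; transpose, out[i][j] = in[j][i], built from two 4-lane shufps passes plus
+; vperm2f128 to stitch the 128-bit halves.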
+
+
+;;
+;; Magic functions defined in RFC 1321
+;;
+; macro MAGIC_F F,X,Y,Z ;; F = ((Z) ^ ((X) & ((Y) ^ (Z))))
+%macro MAGIC_F 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ vpxor %%F,%%Z, %%Y
+ vpand %%F,%%F,%%X
+ vpxor %%F,%%F,%%Z
+%endmacro
+
+; macro MAGIC_G F,X,Y,Z ;; F = F((Z),(X),(Y))
+%macro MAGIC_G 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ MAGIC_F %%F,%%Z,%%X,%%Y
+%endmacro
+
+; macro MAGIC_H F,X,Y,Z ;; F = ((X) ^ (Y) ^ (Z))
+%macro MAGIC_H 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ vpxor %%F,%%Z, %%Y
+ vpxor %%F,%%F, %%X
+%endmacro
+
+; macro MAGIC_I F,X,Y,Z ;; F = ((Y) ^ ((X) | ~(Z)))
+%macro MAGIC_I 4
+%define %%F %1
+%define %%X %2
+%define %%Y %3
+%define %%Z %4
+ vpxor %%F,%%Z,[ONES] ; pnot %%F
+ vpor %%F,%%F,%%X
+ vpxor %%F,%%F,%%Y
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+;;
+;; single MD5 step
+;;
+;; A = B +ROL32((A +MAGIC(B,C,D) +data +const), nrot)
+;;
+; macro MD5_STEP MAGIC_FUN, A,B,C,D, A2,B2,C2,D2, FUN, TMP, FUN2, TMP2, data,
+; MD5const, nrot
+%macro MD5_STEP 16
+%define %%MAGIC_FUN %1
+%define %%rA %2
+%define %%rB %3
+%define %%rC %4
+%define %%rD %5
+%define %%rA2 %6
+%define %%rB2 %7
+%define %%rC2 %8
+%define %%rD2 %9
+%define %%FUN %10
+%define %%TMP %11
+%define %%FUN2 %12
+%define %%TMP2 %13
+%define %%data %14
+%define %%MD5const %15
+%define %%nrot %16
+
+ vpaddd %%rA, %%rA, %%MD5const
+ vpaddd %%rA2, %%rA2, %%MD5const
+ vpaddd %%rA, %%rA, [%%data]
+ vpaddd %%rA2, %%rA2, [%%data + 16*32]
+ %%MAGIC_FUN %%FUN, %%rB,%%rC,%%rD
+ %%MAGIC_FUN %%FUN2, %%rB2,%%rC2,%%rD2
+ vpaddd %%rA, %%rA, %%FUN
+ vpaddd %%rA2, %%rA2, %%FUN2
+ PROLD %%rA,%%nrot, %%TMP
+ PROLD %%rA2,%%nrot, %%TMP2
+ vpaddd %%rA, %%rA, %%rB
+ vpaddd %%rA2, %%rA2, %%rB2
+%endmacro
+
+align 32
+
+; void md5_mb_x8x2_avx2(MD5_ARGS *args, UINT64 num_blks)
+; arg 1 : pointer to MD5_ARGS structure
+; arg 2 : number of blocks (>=1)
+
+global md5_mb_x8x2_avx2:function internal
+md5_mb_x8x2_avx2:
+ mov rax, rsp
+ sub rsp, STACK_size
+ and rsp, -32
+ mov [rsp + _RSP_SAVE], rax
+
+ mov DPTR1, rsp
+ lea DPTR2, [rsp + 32*32]
+
+ ;; Load MD5 constant pointer to register
+ lea TBL, [MD5_TABLE]
+
+ ; Initialize index for data retrieval
+ xor IDX, IDX
+
+ ;; Fetch Pointers to Data Stream 1 to 8
+ mov inp0,[state + _data_ptr + 0*8]
+ mov inp1,[state + _data_ptr + 1*8]
+ mov inp2,[state + _data_ptr + 2*8]
+ mov inp3,[state + _data_ptr + 3*8]
+ mov inp4,[state + _data_ptr + 4*8]
+ mov inp5,[state + _data_ptr + 5*8]
+ mov inp6,[state + _data_ptr + 6*8]
+ mov inp7,[state + _data_ptr + 7*8]
+
+%assign I 0
+%rep 2
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR1+_DATA+(I*8+0)*32],Y_DAT0
+ vmovdqa [DPTR1+_DATA+(I*8+1)*32],Y_DAT1
+ vmovdqa [DPTR1+_DATA+(I*8+2)*32],Y_DAT2
+ vmovdqa [DPTR1+_DATA+(I*8+3)*32],Y_DAT3
+ vmovdqa [DPTR1+_DATA+(I*8+4)*32],Y_DAT4
+ vmovdqa [DPTR1+_DATA+(I*8+5)*32],Y_DAT5
+ vmovdqa [DPTR1+_DATA+(I*8+6)*32],Y_DAT6
+ vmovdqa [DPTR1+_DATA+(I*8+7)*32],Y_DAT7
+
+%assign I (I+1)
+%endrep
+
+ ;; Fetch Pointers to Data Stream 9 to 16
+ mov inp0,[state + _data_ptr + 8*8]
+ mov inp1,[state + _data_ptr + 9*8]
+ mov inp2,[state + _data_ptr + 10*8]
+ mov inp3,[state + _data_ptr + 11*8]
+ mov inp4,[state + _data_ptr + 12*8]
+ mov inp5,[state + _data_ptr + 13*8]
+ mov inp6,[state + _data_ptr + 14*8]
+ mov inp7,[state + _data_ptr + 15*8]
+
+%assign I 0
+%rep 2
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR1+_DATA+((I+2)*8+0)*32],Y_DAT0
+ vmovdqa [DPTR1+_DATA+((I+2)*8+1)*32],Y_DAT1
+ vmovdqa [DPTR1+_DATA+((I+2)*8+2)*32],Y_DAT2
+ vmovdqa [DPTR1+_DATA+((I+2)*8+3)*32],Y_DAT3
+ vmovdqa [DPTR1+_DATA+((I+2)*8+4)*32],Y_DAT4
+ vmovdqa [DPTR1+_DATA+((I+2)*8+5)*32],Y_DAT5
+ vmovdqa [DPTR1+_DATA+((I+2)*8+6)*32],Y_DAT6
+ vmovdqa [DPTR1+_DATA+((I+2)*8+7)*32],Y_DAT7
+
+%assign I (I+1)
+%endrep
+	;; Load the digest for each stream (1-8); digests are already transposed
+ vmovdqu Y_A,[state + 0 * MD5_DIGEST_ROW_SIZE ]
+ vmovdqu Y_B,[state + 1 * MD5_DIGEST_ROW_SIZE ]
+ vmovdqu Y_C,[state + 2 * MD5_DIGEST_ROW_SIZE ]
+ vmovdqu Y_D,[state + 3 * MD5_DIGEST_ROW_SIZE ]
+
+ ; Load the digest for each stream (9-16)
+ vmovdqu Y_A2,[state + 0 * MD5_DIGEST_ROW_SIZE + 32]
+ vmovdqu Y_B2,[state + 1 * MD5_DIGEST_ROW_SIZE + 32]
+ vmovdqu Y_C2,[state + 2 * MD5_DIGEST_ROW_SIZE + 32]
+ vmovdqu Y_D2,[state + 3 * MD5_DIGEST_ROW_SIZE + 32]
+
+lloop:
+
+ ; save old digests to stack
+ vmovdqa [Y_AA], Y_A
+ vmovdqa [Y_BB], Y_B
+ vmovdqa [Y_CC], Y_C
+ vmovdqa [Y_DD], Y_D
+
+ vmovdqa [Y_AA2], Y_A2
+ vmovdqa [Y_BB2], Y_B2
+ vmovdqa [Y_CC2], Y_C2
+ vmovdqa [Y_DD2], Y_D2
+
+ ;; Increment IDX to point to next data block (64 bytes per block)
+ add IDX, 64
+
+ ;; Update size of remaining blocks to process
+ sub num_blks, 1
+ je lastblock
+
+ ; Perform the 64 rounds of processing ...
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14
+
+
+	;; Fetch Pointers to Data Stream 1 to 8 (the inp registers are reused for streams 9 to 16 below)
+ mov inp0,[state + _data_ptr + 0*8]
+ mov inp1,[state + _data_ptr + 1*8]
+ mov inp2,[state + _data_ptr + 2*8]
+ mov inp3,[state + _data_ptr + 3*8]
+ mov inp4,[state + _data_ptr + 4*8]
+ mov inp5,[state + _data_ptr + 5*8]
+ mov inp6,[state + _data_ptr + 6*8]
+ mov inp7,[state + _data_ptr + 7*8]
+
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14
+
+%assign I 0
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24
+
+%assign I (I+1)
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+(I*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+(I*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+(I*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+(I*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+(I*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+(I*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+(I*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+(I*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34
+
+ ;; Fetch Pointers to Data Stream 9 to 16
+ mov inp0,[state + _data_ptr + 8*8]
+ mov inp1,[state + _data_ptr + 9*8]
+ mov inp2,[state + _data_ptr + 10*8]
+ mov inp3,[state + _data_ptr + 11*8]
+ mov inp4,[state + _data_ptr + 12*8]
+ mov inp5,[state + _data_ptr + 13*8]
+ mov inp6,[state + _data_ptr + 14*8]
+ mov inp7,[state + _data_ptr + 15*8]
+
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34
+
+%assign I 0
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44
+
+%assign I (I+1)
+
+ ; Y_A and Y_B share the same registers with Y_DTMP1 and Y_DTMP2
+ ; Therefore we need to save these to stack and restore after transpose
+ vmovdqa [rsp + _TMPDIGEST + 0*32], Y_A
+ vmovdqa [rsp + _TMPDIGEST + 1*32], Y_B
+
+ vmovdqu Y_DAT0,[inp0+IDX+I*32]
+ vmovdqu Y_DAT1,[inp1+IDX+I*32]
+ vmovdqu Y_DAT2,[inp2+IDX+I*32]
+ vmovdqu Y_DAT3,[inp3+IDX+I*32]
+ vmovdqu Y_DAT4,[inp4+IDX+I*32]
+ vmovdqu Y_DAT5,[inp5+IDX+I*32]
+ vmovdqu Y_DAT6,[inp6+IDX+I*32]
+ vmovdqu Y_DAT7,[inp7+IDX+I*32]
+ TRANSPOSE8 Y_DAT0, Y_DAT1, Y_DAT2, Y_DAT3, Y_DAT4, Y_DAT5, Y_DAT6, Y_DAT7, Y_DTMP1, Y_DTMP2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+0)*32],Y_DAT0
+ vmovdqa [DPTR2+_DATA+((I+2)*8+1)*32],Y_DAT1
+ vmovdqa [DPTR2+_DATA+((I+2)*8+2)*32],Y_DAT2
+ vmovdqa [DPTR2+_DATA+((I+2)*8+3)*32],Y_DAT3
+ vmovdqa [DPTR2+_DATA+((I+2)*8+4)*32],Y_DAT4
+ vmovdqa [DPTR2+_DATA+((I+2)*8+5)*32],Y_DAT5
+ vmovdqa [DPTR2+_DATA+((I+2)*8+6)*32],Y_DAT6
+ vmovdqa [DPTR2+_DATA+((I+2)*8+7)*32],Y_DAT7
+
+ ; Restore Y_A and Y_B
+ vmovdqa Y_A, [rsp + _TMPDIGEST + 0*32]
+ vmovdqa Y_B, [rsp + _TMPDIGEST + 1*32]
+
+ ; Add results to old digest values
+
+ vpaddd Y_A,Y_A,[Y_AA]
+ vpaddd Y_B,Y_B,[Y_BB]
+ vpaddd Y_C,Y_C,[Y_CC]
+ vpaddd Y_D,Y_D,[Y_DD]
+
+ vpaddd Y_A2,Y_A2,[Y_AA2]
+ vpaddd Y_B2,Y_B2,[Y_BB2]
+ vpaddd Y_C2,Y_C2,[Y_CC2]
+ vpaddd Y_D2,Y_D2,[Y_DD2]
+
+ ; Swap DPTR1 and DPTR2
+ xchg DPTR1, DPTR2
+
+ ;; Proceed to processing of next block
+ jmp lloop
+
+lastblock:
+
+ ; Perform the 64 rounds of processing ...
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+ 0*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+ 1*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+ 2*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+ 3*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+ 4*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+ 5*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+ 6*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+ 7*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+ 8*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+ 9*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+10*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+11*32], rot14
+ MD5_STEP MAGIC_F, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+12*32], rot11
+ MD5_STEP MAGIC_F, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+13*32], rot12
+ MD5_STEP MAGIC_F, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+14*32], rot13
+ MD5_STEP MAGIC_F, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+15*32], rot14
+
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+16*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+17*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+18*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+19*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+20*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+21*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+22*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+23*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+24*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+25*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+26*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+27*32], rot24
+ MD5_STEP MAGIC_G, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+28*32], rot21
+ MD5_STEP MAGIC_G, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+29*32], rot22
+ MD5_STEP MAGIC_G, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+30*32], rot23
+ MD5_STEP MAGIC_G, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+31*32], rot24
+
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+32*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+33*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+34*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+35*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+36*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+37*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+38*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+39*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+40*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+41*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+42*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+43*32], rot34
+ MD5_STEP MAGIC_H, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+44*32], rot31
+ MD5_STEP MAGIC_H, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+45*32], rot32
+ MD5_STEP MAGIC_H, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+46*32], rot33
+ MD5_STEP MAGIC_H, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+47*32], rot34
+
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 0*32, [TBL+48*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 7*32, [TBL+49*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+14*32, [TBL+50*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 5*32, [TBL+51*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+12*32, [TBL+52*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 3*32, [TBL+53*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+10*32, [TBL+54*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 1*32, [TBL+55*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 8*32, [TBL+56*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+15*32, [TBL+57*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 6*32, [TBL+58*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+13*32, [TBL+59*32], rot44
+ MD5_STEP MAGIC_I, Y_A,Y_B,Y_C,Y_D, Y_A2,Y_B2,Y_C2,Y_D2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 4*32, [TBL+60*32], rot41
+ MD5_STEP MAGIC_I, Y_D,Y_A,Y_B,Y_C, Y_D2,Y_A2,Y_B2,Y_C2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+11*32, [TBL+61*32], rot42
+ MD5_STEP MAGIC_I, Y_C,Y_D,Y_A,Y_B, Y_C2,Y_D2,Y_A2,Y_B2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 2*32, [TBL+62*32], rot43
+ MD5_STEP MAGIC_I, Y_B,Y_C,Y_D,Y_A, Y_B2,Y_C2,Y_D2,Y_A2, Y_FUN,Y_TMP, Y_FUN2,Y_TMP2, DPTR1+ 9*32, [TBL+63*32], rot44
+
+	;; update the data pointers
+%assign I 0
+%rep 8
+ mov inp0, [state + _data_ptr + (2*I)*8]
+ mov inp1, [state + _data_ptr + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [state + _data_ptr + (2*I)*8], inp0
+ mov [state + _data_ptr + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ vpaddd Y_A,Y_A,[Y_AA]
+ vpaddd Y_B,Y_B,[Y_BB]
+ vpaddd Y_C,Y_C,[Y_CC]
+ vpaddd Y_D,Y_D,[Y_DD]
+
+ vpaddd Y_A2,Y_A2,[Y_AA2]
+ vpaddd Y_B2,Y_B2,[Y_BB2]
+ vpaddd Y_C2,Y_C2,[Y_CC2]
+ vpaddd Y_D2,Y_D2,[Y_DD2]
+
+
+
+ vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE ],Y_A
+ vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE ],Y_B
+ vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE ],Y_C
+ vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE ],Y_D
+
+
+ vmovdqu [state + 0*MD5_DIGEST_ROW_SIZE + 32 ],Y_A2 ;; 32 is YMM width
+ vmovdqu [state + 1*MD5_DIGEST_ROW_SIZE + 32 ],Y_B2
+ vmovdqu [state + 2*MD5_DIGEST_ROW_SIZE + 32 ],Y_C2
+ vmovdqu [state + 3*MD5_DIGEST_ROW_SIZE + 32 ],Y_D2
+
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+
+
+ mov rsp, [rsp + _RSP_SAVE]
+
+ ret
+
+section .data
+align 64
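+; Each 32-bit MD5 round constant below is replicated eight times (32 bytes),
+; so a single YMM-wide memory operand supplies the constant to all eight
+; dword lanes processed per step.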
+MD5_TABLE:
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xd76aa478, 0xd76aa478, 0xd76aa478, 0xd76aa478
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0xe8c7b756, 0xe8c7b756, 0xe8c7b756, 0xe8c7b756
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0x242070db, 0x242070db, 0x242070db, 0x242070db
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xc1bdceee, 0xc1bdceee, 0xc1bdceee, 0xc1bdceee
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0xf57c0faf, 0xf57c0faf, 0xf57c0faf, 0xf57c0faf
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0x4787c62a, 0x4787c62a, 0x4787c62a, 0x4787c62a
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xa8304613, 0xa8304613, 0xa8304613, 0xa8304613
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0xfd469501, 0xfd469501, 0xfd469501, 0xfd469501
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x698098d8, 0x698098d8, 0x698098d8, 0x698098d8
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0x8b44f7af, 0x8b44f7af, 0x8b44f7af, 0x8b44f7af
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0xffff5bb1, 0xffff5bb1, 0xffff5bb1, 0xffff5bb1
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x895cd7be, 0x895cd7be, 0x895cd7be, 0x895cd7be
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0x6b901122, 0x6b901122, 0x6b901122, 0x6b901122
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xfd987193, 0xfd987193, 0xfd987193, 0xfd987193
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0xa679438e, 0xa679438e, 0xa679438e, 0xa679438e
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0x49b40821, 0x49b40821, 0x49b40821, 0x49b40821
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xf61e2562, 0xf61e2562, 0xf61e2562, 0xf61e2562
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0xc040b340, 0xc040b340, 0xc040b340, 0xc040b340
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0x265e5a51, 0x265e5a51, 0x265e5a51, 0x265e5a51
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa, 0xe9b6c7aa
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0xd62f105d, 0xd62f105d, 0xd62f105d, 0xd62f105d
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0x02441453, 0x02441453, 0x02441453, 0x02441453
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xd8a1e681, 0xd8a1e681, 0xd8a1e681, 0xd8a1e681
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8, 0xe7d3fbc8
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0x21e1cde6, 0x21e1cde6, 0x21e1cde6, 0x21e1cde6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xc33707d6, 0xc33707d6, 0xc33707d6, 0xc33707d6
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0xf4d50d87, 0xf4d50d87, 0xf4d50d87, 0xf4d50d87
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0x455a14ed, 0x455a14ed, 0x455a14ed, 0x455a14ed
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xa9e3e905, 0xa9e3e905, 0xa9e3e905, 0xa9e3e905
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8, 0xfcefa3f8
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x676f02d9, 0x676f02d9, 0x676f02d9, 0x676f02d9
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a, 0x8d2a4c8a
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0xfffa3942, 0xfffa3942, 0xfffa3942, 0xfffa3942
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x8771f681, 0x8771f681, 0x8771f681, 0x8771f681
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0x6d9d6122, 0x6d9d6122, 0x6d9d6122, 0x6d9d6122
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xfde5380c, 0xfde5380c, 0xfde5380c, 0xfde5380c
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0xa4beea44, 0xa4beea44, 0xa4beea44, 0xa4beea44
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9, 0x4bdecfa9
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60, 0xf6bb4b60
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0xbebfbc70, 0xbebfbc70, 0xbebfbc70, 0xbebfbc70
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0x289b7ec6, 0x289b7ec6, 0x289b7ec6, 0x289b7ec6
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xeaa127fa, 0xeaa127fa, 0xeaa127fa, 0xeaa127fa
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0xd4ef3085, 0xd4ef3085, 0xd4ef3085, 0xd4ef3085
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0x04881d05, 0x04881d05, 0x04881d05, 0x04881d05
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xd9d4d039, 0xd9d4d039, 0xd9d4d039, 0xd9d4d039
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0xe6db99e5, 0xe6db99e5, 0xe6db99e5, 0xe6db99e5
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8, 0x1fa27cf8
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xc4ac5665, 0xc4ac5665, 0xc4ac5665, 0xc4ac5665
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0xf4292244, 0xf4292244, 0xf4292244, 0xf4292244
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0x432aff97, 0x432aff97, 0x432aff97, 0x432aff97
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xab9423a7, 0xab9423a7, 0xab9423a7, 0xab9423a7
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0xfc93a039, 0xfc93a039, 0xfc93a039, 0xfc93a039
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x655b59c3, 0x655b59c3, 0x655b59c3, 0x655b59c3
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92, 0x8f0ccc92
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0xffeff47d, 0xffeff47d, 0xffeff47d, 0xffeff47d
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x85845dd1, 0x85845dd1, 0x85845dd1, 0x85845dd1
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f, 0x6fa87e4f
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0, 0xfe2ce6e0
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0xa3014314, 0xa3014314, 0xa3014314, 0xa3014314
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0x4e0811a1, 0x4e0811a1, 0x4e0811a1, 0x4e0811a1
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xf7537e82, 0xf7537e82, 0xf7537e82, 0xf7537e82
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0xbd3af235, 0xbd3af235, 0xbd3af235, 0xbd3af235
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb, 0x2ad7d2bb
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
+ dd 0xeb86d391, 0xeb86d391, 0xeb86d391, 0xeb86d391
+ONES: dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+ dd 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm
new file mode 100644
index 000000000..2c8212855
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_multibinary.asm
@@ -0,0 +1,83 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern md5_ctx_mgr_init_sse
+extern md5_ctx_mgr_submit_sse
+extern md5_ctx_mgr_flush_sse
+
+extern md5_ctx_mgr_init_avx
+extern md5_ctx_mgr_submit_avx
+extern md5_ctx_mgr_flush_avx
+
+extern md5_ctx_mgr_init_avx2
+extern md5_ctx_mgr_submit_avx2
+extern md5_ctx_mgr_flush_avx2
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern md5_ctx_mgr_init_avx512
+ extern md5_ctx_mgr_submit_avx512
+ extern md5_ctx_mgr_flush_avx512
+%endif
+
+;;; *_mbinit are the initial values of the *_dispatched function pointers;
+;;; they are updated on the first call, so *_dispatch_init only runs once.
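+;;; On that first call the dispatch code selects the best variant the running
+;;; CPU supports from the list below (SSE, AVX, AVX2, or AVX512 when the
+;;; assembler supports it), records it in the dispatch pointer, and invokes it;
+;;; all later calls go straight through the pointer.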
+
+; Initialise symbols
+mbin_interface md5_ctx_mgr_init
+mbin_interface md5_ctx_mgr_submit
+mbin_interface md5_ctx_mgr_flush
+
+%ifdef HAVE_AS_KNOWS_AVX512
+	; Reuse mbin_dispatch_init6 by passing the SSE version in place of the base version
+ mbin_dispatch_init6 md5_ctx_mgr_init, md5_ctx_mgr_init_sse, md5_ctx_mgr_init_sse, md5_ctx_mgr_init_avx, md5_ctx_mgr_init_avx2, md5_ctx_mgr_init_avx512
+ mbin_dispatch_init6 md5_ctx_mgr_submit, md5_ctx_mgr_submit_sse, md5_ctx_mgr_submit_sse, md5_ctx_mgr_submit_avx, md5_ctx_mgr_submit_avx2, md5_ctx_mgr_submit_avx512
+ mbin_dispatch_init6 md5_ctx_mgr_flush, md5_ctx_mgr_flush_sse, md5_ctx_mgr_flush_sse, md5_ctx_mgr_flush_avx, md5_ctx_mgr_flush_avx2, md5_ctx_mgr_flush_avx512
+%else
+ mbin_dispatch_init md5_ctx_mgr_init, md5_ctx_mgr_init_sse, md5_ctx_mgr_init_avx, md5_ctx_mgr_init_avx2
+ mbin_dispatch_init md5_ctx_mgr_submit, md5_ctx_mgr_submit_sse, md5_ctx_mgr_submit_avx, md5_ctx_mgr_submit_avx2
+ mbin_dispatch_init md5_ctx_mgr_flush, md5_ctx_mgr_flush_sse, md5_ctx_mgr_flush_avx, md5_ctx_mgr_flush_avx2
+%endif
+
+;; func core, ver, snum
+slversion md5_ctx_mgr_init, 00, 03, 0189
+slversion md5_ctx_mgr_submit, 00, 03, 018a
+slversion md5_ctx_mgr_flush, 00, 03, 018b
diff --git a/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c
new file mode 100644
index 000000000..9cb1fd646
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/md5_mb/md5_ref.c
@@ -0,0 +1,193 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference MD5 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+void md5_single(const uint8_t * data, uint32_t digest[4]);
+
+#define H0 0x67452301
+#define H1 0xefcdab89
+#define H2 0x98badcfe
+#define H3 0x10325476
+
+void md5_ref(uint8_t * input_data, uint32_t * digest, uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[128];
+ union {
+ uint64_t uint;
+ uint8_t uchar[8];
+ } convert;
+ uint8_t *p;
+
+ digest[0] = H0;
+ digest[1] = H1;
+ digest[2] = H2;
+ digest[3] = H3;
+
+ i = len;
+ while (i >= 64) {
+ md5_single(input_data, digest);
+ input_data += 64;
+ i -= 64;
+ }
+ // 0 <= i < 64
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < 120; j++)
+ buf[j] = 0;
+
+ if (i > 64 - 8)
+ i = 128;
+ else
+ i = 64;
+
+ convert.uint = 8 * len;
+ p = buf + i - 8;
+ p[7] = convert.uchar[7];
+ p[6] = convert.uchar[6];
+ p[5] = convert.uchar[5];
+ p[4] = convert.uchar[4];
+ p[3] = convert.uchar[3];
+ p[2] = convert.uchar[2];
+ p[1] = convert.uchar[1];
+ p[0] = convert.uchar[0];
+
+ md5_single(buf, digest);
+ if (i == 128)
+ md5_single(buf + 64, digest);
+}
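+/* Illustrative usage (not part of the original source): hash a short buffer
+ * with the reference implementation.
+ *
+ *     uint32_t digest[4];
+ *     md5_ref((uint8_t *)"abc", digest, 3);
+ *     // digest[0..3] now hold the MD5 state words of "abc"
+ */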
+
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (c ^ (d & (b ^ c)))
+#define F3(b,c,d) (b ^ c ^ d)
+#define F4(b,c,d) (c ^ (b | ~d))
+
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+#define step(i,a,b,c,d,f,k,w,r) \
+ if (i < 16) {f = F1(b,c,d); } else \
+ if (i < 32) {f = F2(b,c,d); } else \
+ if (i < 48) {f = F3(b,c,d); } else \
+ {f = F4(b,c,d); } \
+ f = a + f + k + w; \
+ a = b + rol32(f, r);
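+/* One MD5 round: a = b + rol32(a + F_i(b,c,d) + k + w, r), where the auxiliary
+ * function F_i is selected by the round index i (F1 for rounds 0-15, F2 for
+ * 16-31, F3 for 32-47, F4 for 48-63). */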
+
+void md5_single(const uint8_t * data, uint32_t digest[4])
+{
+ uint32_t a, b, c, d;
+ uint32_t f;
+ uint32_t *w = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+
+ step(0, a, b, c, d, f, 0xd76aa478, w[0], 7);
+ step(1, d, a, b, c, f, 0xe8c7b756, w[1], 12);
+ step(2, c, d, a, b, f, 0x242070db, w[2], 17);
+ step(3, b, c, d, a, f, 0xc1bdceee, w[3], 22);
+ step(4, a, b, c, d, f, 0xf57c0faf, w[4], 7);
+ step(5, d, a, b, c, f, 0x4787c62a, w[5], 12);
+ step(6, c, d, a, b, f, 0xa8304613, w[6], 17);
+ step(7, b, c, d, a, f, 0xfd469501, w[7], 22);
+ step(8, a, b, c, d, f, 0x698098d8, w[8], 7);
+ step(9, d, a, b, c, f, 0x8b44f7af, w[9], 12);
+ step(10, c, d, a, b, f, 0xffff5bb1, w[10], 17);
+ step(11, b, c, d, a, f, 0x895cd7be, w[11], 22);
+ step(12, a, b, c, d, f, 0x6b901122, w[12], 7);
+ step(13, d, a, b, c, f, 0xfd987193, w[13], 12);
+ step(14, c, d, a, b, f, 0xa679438e, w[14], 17);
+ step(15, b, c, d, a, f, 0x49b40821, w[15], 22);
+
+ step(16, a, b, c, d, f, 0xf61e2562, w[1], 5);
+ step(17, d, a, b, c, f, 0xc040b340, w[6], 9);
+ step(18, c, d, a, b, f, 0x265e5a51, w[11], 14);
+ step(19, b, c, d, a, f, 0xe9b6c7aa, w[0], 20);
+ step(20, a, b, c, d, f, 0xd62f105d, w[5], 5);
+ step(21, d, a, b, c, f, 0x02441453, w[10], 9);
+ step(22, c, d, a, b, f, 0xd8a1e681, w[15], 14);
+ step(23, b, c, d, a, f, 0xe7d3fbc8, w[4], 20);
+ step(24, a, b, c, d, f, 0x21e1cde6, w[9], 5);
+ step(25, d, a, b, c, f, 0xc33707d6, w[14], 9);
+ step(26, c, d, a, b, f, 0xf4d50d87, w[3], 14);
+ step(27, b, c, d, a, f, 0x455a14ed, w[8], 20);
+ step(28, a, b, c, d, f, 0xa9e3e905, w[13], 5);
+ step(29, d, a, b, c, f, 0xfcefa3f8, w[2], 9);
+ step(30, c, d, a, b, f, 0x676f02d9, w[7], 14);
+ step(31, b, c, d, a, f, 0x8d2a4c8a, w[12], 20);
+
+ step(32, a, b, c, d, f, 0xfffa3942, w[5], 4);
+ step(33, d, a, b, c, f, 0x8771f681, w[8], 11);
+ step(34, c, d, a, b, f, 0x6d9d6122, w[11], 16);
+ step(35, b, c, d, a, f, 0xfde5380c, w[14], 23);
+ step(36, a, b, c, d, f, 0xa4beea44, w[1], 4);
+ step(37, d, a, b, c, f, 0x4bdecfa9, w[4], 11);
+ step(38, c, d, a, b, f, 0xf6bb4b60, w[7], 16);
+ step(39, b, c, d, a, f, 0xbebfbc70, w[10], 23);
+ step(40, a, b, c, d, f, 0x289b7ec6, w[13], 4);
+ step(41, d, a, b, c, f, 0xeaa127fa, w[0], 11);
+ step(42, c, d, a, b, f, 0xd4ef3085, w[3], 16);
+ step(43, b, c, d, a, f, 0x04881d05, w[6], 23);
+ step(44, a, b, c, d, f, 0xd9d4d039, w[9], 4);
+ step(45, d, a, b, c, f, 0xe6db99e5, w[12], 11);
+ step(46, c, d, a, b, f, 0x1fa27cf8, w[15], 16);
+ step(47, b, c, d, a, f, 0xc4ac5665, w[2], 23);
+
+ step(48, a, b, c, d, f, 0xf4292244, w[0], 6);
+ step(49, d, a, b, c, f, 0x432aff97, w[7], 10);
+ step(50, c, d, a, b, f, 0xab9423a7, w[14], 15);
+ step(51, b, c, d, a, f, 0xfc93a039, w[5], 21);
+ step(52, a, b, c, d, f, 0x655b59c3, w[12], 6);
+ step(53, d, a, b, c, f, 0x8f0ccc92, w[3], 10);
+ step(54, c, d, a, b, f, 0xffeff47d, w[10], 15);
+ step(55, b, c, d, a, f, 0x85845dd1, w[1], 21);
+ step(56, a, b, c, d, f, 0x6fa87e4f, w[8], 6);
+ step(57, d, a, b, c, f, 0xfe2ce6e0, w[15], 10);
+ step(58, c, d, a, b, f, 0xa3014314, w[6], 15);
+ step(59, b, c, d, a, f, 0x4e0811a1, w[13], 21);
+ step(60, a, b, c, d, f, 0xf7537e82, w[4], 6);
+ step(61, d, a, b, c, f, 0xbd3af235, w[11], 10);
+ step(62, c, d, a, b, f, 0x2ad7d2bb, w[2], 15);
+ step(63, b, c, d, a, f, 0xeb86d391, w[9], 21);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am b/src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am
new file mode 100644
index 000000000..1ec5f35c5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/Makefile.am
@@ -0,0 +1,67 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc_sha1 = mh_sha1/sha1_for_mh_sha1.c
+
+lsrc_mh_sha1 = mh_sha1/mh_sha1.c \
+ mh_sha1/mh_sha1_block_sse.asm \
+ mh_sha1/mh_sha1_block_avx.asm \
+ mh_sha1/mh_sha1_block_avx2.asm \
+ mh_sha1/mh_sha1_multibinary.asm \
+ mh_sha1/mh_sha1_finalize_base.c \
+ mh_sha1/mh_sha1_update_base.c \
+ mh_sha1/mh_sha1_block_base.c
+
+lsrc_mh_sha1 += mh_sha1/mh_sha1_block_avx512.asm \
+ mh_sha1/mh_sha1_avx512.c
+
+lsrc += $(lsrc_sha1) \
+ $(lsrc_mh_sha1)
+
+other_src += mh_sha1/mh_sha1_ref.c \
+ include/reg_sizes.asm \
+ include/multibinary.asm \
+ include/test.h \
+ mh_sha1/mh_sha1_internal.h
+
+extern_hdrs += include/mh_sha1.h
+
+check_tests += mh_sha1/mh_sha1_test
+unit_tests += mh_sha1/mh_sha1_update_test
+
+perf_tests += mh_sha1/mh_sha1_perf
+
+
+mh_sha1_test: mh_sha1_ref.o
+mh_sha1_mh_sha1_test_LDADD = mh_sha1/mh_sha1_ref.lo libisal_crypto.la
+
+mh_sha1_update_test: mh_sha1_ref.o
+mh_sha1_mh_sha1_update_test_LDADD = mh_sha1/mh_sha1_ref.lo libisal_crypto.la
+
+mh_sha1_mh_sha1_perf_LDADD = libisal_crypto.la
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c
new file mode 100644
index 000000000..6cb458fc8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1.c
@@ -0,0 +1,137 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+int mh_sha1_init(struct mh_sha1_ctx *ctx)
+{
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint32_t i;
+
+ if (ctx == NULL)
+ return MH_SHA1_CTX_ERROR_NULL;
+
+ memset(ctx, 0, sizeof(*ctx));
+
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+ for (i = 0; i < HASH_SEGS; i++) {
+ mh_sha1_segs_digests[0][i] = MH_SHA1_H0;
+ mh_sha1_segs_digests[1][i] = MH_SHA1_H1;
+ mh_sha1_segs_digests[2][i] = MH_SHA1_H2;
+ mh_sha1_segs_digests[3][i] = MH_SHA1_H3;
+ mh_sha1_segs_digests[4][i] = MH_SHA1_H4;
+ }
+
+ return MH_SHA1_CTX_ERROR_NONE;
+}
+
+/***************mh_sha1_update***********/
+// mh_sha1_update_sse.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_sse
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_sse
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
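+/* The #define/#include/#undef sequence above instantiates the generic body in
+ * mh_sha1_update_base.c under SSE-specific names; the AVX and AVX2 variants
+ * below repeat the same pattern with their own block functions. */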
+
+// mh_sha1_update_avx.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_avx
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+// mh_sha1_update_avx2.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_avx2
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx2
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************mh_sha1_finalize AND mh_sha1_tail***********/
+// mh_sha1_tail processes the last, possibly partial, block of src data
+// mh_sha1_finalize is a mh_sha1_ctx wrapper around mh_sha1_tail
+
+// mh_sha1_finalize_sse.c and mh_sha1_tail_sse.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_sse
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_sse
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_sse
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+// mh_sha1_finalize_avx.c and mh_sha1_tail_avx.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_avx
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+// mh_sha1_finalize_avx2.c and mh_sha1_tail_avx2.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_avx2
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx2
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx2
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************version info***********/
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+// Version info
+struct slver mh_sha1_init_slver_00000271;
+struct slver mh_sha1_init_slver = { 0x0271, 0x00, 0x00 };
+
+// mh_sha1_update version info
+struct slver mh_sha1_update_sse_slver_00000274;
+struct slver mh_sha1_update_sse_slver = { 0x0274, 0x00, 0x00 };
+
+struct slver mh_sha1_update_avx_slver_02000276;
+struct slver mh_sha1_update_avx_slver = { 0x0276, 0x00, 0x02 };
+
+struct slver mh_sha1_update_avx2_slver_04000278;
+struct slver mh_sha1_update_avx2_slver = { 0x0278, 0x00, 0x04 };
+
+// mh_sha1_finalize version info
+struct slver mh_sha1_finalize_sse_slver_00000275;
+struct slver mh_sha1_finalize_sse_slver = { 0x0275, 0x00, 0x00 };
+
+struct slver mh_sha1_finalize_avx_slver_02000277;
+struct slver mh_sha1_finalize_avx_slver = { 0x0277, 0x00, 0x02 };
+
+struct slver mh_sha1_finalize_avx2_slver_04000279;
+struct slver mh_sha1_finalize_avx2_slver = { 0x0279, 0x00, 0x04 };
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c
new file mode 100644
index 000000000..15f0ae1e2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_avx512.c
@@ -0,0 +1,71 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+/***************mh_sha1_update***********/
+// mh_sha1_update_avx512.c
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_avx512
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx512
+#include "mh_sha1_update_base.c"
+#undef MH_SHA1_UPDATE_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************mh_sha1_finalize AND mh_sha1_tail***********/
+// mh_sha1_tail processes the last, possibly partial, block of src data
+// mh_sha1_finalize is a mh_sha1_ctx wrapper around mh_sha1_tail
+// mh_sha1_finalize_avx512.c and mh_sha1_tail_avx512.c
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_avx512
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx512
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_avx512
+#include "mh_sha1_finalize_base.c"
+#undef MH_SHA1_FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+#undef MH_SHA1_BLOCK_FUNCTION
+
+/***************version info***********/
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// mh_sha1_update version info
+struct slver mh_sha1_update_avx512_slver_0600027c;
+struct slver mh_sha1_update_avx512_slver = { 0x027c, 0x00, 0x06 };
+
+// mh_sha1_finalize version info
+struct slver mh_sha1_finalize_avx512_slver_0600027d;
+struct slver mh_sha1_finalize_avx512_slver = { 0x027d, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
+
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm
new file mode 100644
index 000000000..8b6031997
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx.asm
@@ -0,0 +1,502 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA1 digests using AVX
+;;
+
+%include "reg_sizes.asm"
+default rel
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF, %%regC,%%regD
+ vpand %%regF, %%regF,%%regB
+ vpxor %%regF, %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
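+; Note: ((B | C) & D) | (B & C) as computed above is equivalent to the
+; majority function (B & C) | (B & D) | (C & D), saving one operation.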
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-(%%imm))
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-(%%imm))
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
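+; Both PROLD variants emulate a 32-bit rotate-left with shift/shift/or, since
+; AVX has no packed rotate instruction (vprold only appears with AVX-512).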
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[%%data + (%%memW * 16)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
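+; The W16/W15/W14 block above implements the SHA-1 message schedule
+; W[t] = rol(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1) for four lanes at a time,
+; keeping a 16-entry circular schedule in the frame buffer.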
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
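+; prefetchnta pulls upcoming input into cache with a non-temporal hint; pref
+; starts one 1KB block ahead and advances with the segment loop so the same
+; lines are not prefetched twice.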
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 |    | Ea| Eb | Ec |...| Ep |
+
+align 32
+
+;void mh_sha1_block_avx(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer, used to store the big-endian data
+; arg 3 number of 1KB blocks
+;
+global mh_sha1_block_avx:function internal
+func(mh_sha1_block_avx)
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+	; align rsp to the 16-byte boundary required by the aligned AVX moves
+ and rsp, ~0x0F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 5
+ VMOVPS A, [mh_digests_p + I*64 + 16*0]
+ VMOVPS B, [mh_digests_p + I*64 + 16*1]
+ VMOVPS C, [mh_digests_p + I*64 + 16*2]
+ VMOVPS D, [mh_digests_p + I*64 + 16*3]
+
+ vmovdqa [rsp + I*64 + 16*0], A
+ vmovdqa [rsp + I*64 + 16*1], B
+ vmovdqa [rsp + I*64 + 16*2], C
+ vmovdqa [rsp + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4
+ %assign I 0
+ %rep 16
+ VMOVPS T0,[mh_in_p + I*64+0*16]
+ VMOVPS T1,[mh_in_p + I*64+1*16]
+ VMOVPS T2,[mh_in_p + I*64+2*16]
+ VMOVPS T3,[mh_in_p + I*64+3*16]
+
+ vpshufb T0, F
+ vmovdqa [mh_data_p +(I)*16 +0*256],T0
+ vpshufb T1, F
+ vmovdqa [mh_data_p +(I)*16 +1*256],T1
+ vpshufb T2, F
+ vmovdqa [mh_data_p +(I)*16 +2*256],T2
+ vpshufb T3, F
+ vmovdqa [mh_data_p +(I)*16 +3*256],T3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+	mov	pref, 1024	;avoid prefetching the same lines repeatedly
+ .segs_loop:
+ ;; Initialize digests
+ vmovdqa A, [rsp + 0*64 + mh_segs]
+ vmovdqa B, [rsp + 1*64 + mh_segs]
+ vmovdqa C, [rsp + 2*64 + mh_segs]
+ vmovdqa D, [rsp + 3*64 + mh_segs]
+ vmovdqa E, [rsp + 4*64 + mh_segs]
+
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 16]
+ vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 16]
+ %rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+;; do rounds 20...39
+ vmovdqa K, [K20_39]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 40...59
+ vmovdqa K, [K40_59]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*1]
+;; do rounds 60...79
+ vmovdqa K, [K60_79]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ vpaddd A, AA
+ vpaddd B, BB
+ vpaddd C, CC
+ vpaddd D, DD
+ vpaddd E, EE
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], A
+ vmovdqa [rsp + 1*64 + mh_segs], B
+ vmovdqa [rsp + 2*64 + mh_segs], C
+ vmovdqa [rsp + 3*64 + mh_segs], D
+ vmovdqa [rsp + 4*64 + mh_segs], E
+
+ add pref, 256
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 5
+ vmovdqa A, [rsp + I*64 + 16*0]
+ vmovdqa B, [rsp + I*64 + 16*1]
+ vmovdqa C, [rsp + I*64 + 16*2]
+ vmovdqa D, [rsp + I*64 + 16*3]
+
+ VMOVPS [mh_digests_p + I*64 + 16*0], A
+ VMOVPS [mh_digests_p + I*64 + 16*1], B
+ VMOVPS [mh_digests_p + I*64 + 16*2], C
+ VMOVPS [mh_digests_p + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm
new file mode 100644
index 000000000..77aeda0d7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx2.asm
@@ -0,0 +1,509 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA1 digests using AVX2
+;;
+
+%include "reg_sizes.asm"
+default rel
+
+;; Magic functions defined in FIPS 180-1
+;;
+;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF, %%regC,%%regD
+ vpand %%regF, %%regF,%%regB
+ vpxor %%regF, %%regF,%%regD
+%endmacro
+
+;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+
+
+;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-%%imm)
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[%%data + (%%memW * 32)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define A ymm0
+%define B ymm1
+%define C ymm2
+%define D ymm3
+%define E ymm4
+
+%define F ymm5
+%define T0 ymm6
+%define T1 ymm7
+%define T2 ymm8
+%define T3 ymm9
+%define T4 ymm10
+%define T5 ymm11
+%define T6 ymm12
+%define T7 ymm13
+%define T8 ymm14
+%define T9 ymm15
+
+%define AA ymm5
+%define BB ymm6
+%define CC ymm7
+%define DD ymm8
+%define EE ymm9
+%define TMP ymm10
+%define FUN ymm11
+%define K ymm12
+%define W14 ymm13
+%define W15 ymm14
+%define W16 ymm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 |    | Ea| Eb | Ec |...| Ep |
+
+align 32
+
+;void mh_sha1_block_avx2(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including segment digests (uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS])
+; arg 2 pointer to aligned_frame_buffer, which is used to store the big-endian data.
+; arg 3 number of 1KB blocks
+;
+global mh_sha1_block_avx2:function internal
+func(mh_sha1_block_avx2)
+ FUNC_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 32 Bytes needed by avx2
+ and rsp, ~0x1F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 2
+ VMOVPS A, [mh_digests_p + I*32*5 + 32*0]
+ VMOVPS B, [mh_digests_p + I*32*5 + 32*1]
+ VMOVPS C, [mh_digests_p + I*32*5 + 32*2]
+ VMOVPS D, [mh_digests_p + I*32*5 + 32*3]
+ VMOVPS E, [mh_digests_p + I*32*5 + 32*4]
+
+ vmovdqa [rsp + I*32*5 + 32*0], A
+ vmovdqa [rsp + I*32*5 + 32*1], B
+ vmovdqa [rsp + I*32*5 + 32*2], C
+ vmovdqa [rsp + I*32*5 + 32*3], D
+ vmovdqa [rsp + I*32*5 + 32*4], E
+ %assign I (I+1)
+ %endrep
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*8_SEGS*5*2
+%assign I 0
+%rep 16
+ VMOVPS T0,[mh_in_p + I*64+0*32]
+ VMOVPS T1,[mh_in_p + I*64+1*32]
+
+ vpshufb T0, T0, F
+ vmovdqa [mh_data_p +I*32+0*512],T0
+ vpshufb T1, T1, F
+ vmovdqa [mh_data_p +I*32+1*512],T1
+%assign I (I+1)
+%endrep
+
+ mov mh_segs, 0 ;start from the first 8 segments
+	mov	pref, 1024		;avoid prefetching repeatedly
+ .segs_loop:
+ ;; Initialize digests
+ vmovdqa A, [rsp + 0*64 + mh_segs]
+ vmovdqa B, [rsp + 1*64 + mh_segs]
+ vmovdqa C, [rsp + 2*64 + mh_segs]
+ vmovdqa D, [rsp + 3*64 + mh_segs]
+ vmovdqa E, [rsp + 4*64 + mh_segs]
+
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 32]
+ vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 32]
+ %rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+ PREFETCH_X [mh_in_p + pref+128*1]
+;; do rounds 20...39
+ vmovdqa K, [K20_39]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+;; do rounds 40...59
+ vmovdqa K, [K40_59]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*2]
+ PREFETCH_X [mh_in_p + pref+128*3]
+;; do rounds 60...79
+ vmovdqa K, [K60_79]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ vpaddd A,A, AA
+ vpaddd B,B, BB
+ vpaddd C,C, CC
+ vpaddd D,D, DD
+ vpaddd E,E, EE
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], A
+ vmovdqa [rsp + 1*64 + mh_segs], B
+ vmovdqa [rsp + 2*64 + mh_segs], C
+ vmovdqa [rsp + 3*64 + mh_segs], D
+ vmovdqa [rsp + 4*64 + mh_segs], E
+
+ add pref, 512
+
+ add mh_data_p, 512
+ add mh_segs, 32
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 2
+ vmovdqa A, [rsp + I*32*5 + 32*0]
+ vmovdqa B, [rsp + I*32*5 + 32*1]
+ vmovdqa C, [rsp + I*32*5 + 32*2]
+ vmovdqa D, [rsp + I*32*5 + 32*3]
+ vmovdqa E, [rsp + I*32*5 + 32*4]
+
+ VMOVPS [mh_digests_p + I*32*5 + 32*0], A
+ VMOVPS [mh_digests_p + I*32*5 + 32*1], B
+ VMOVPS [mh_digests_p + I*32*5 + 32*2], C
+ VMOVPS [mh_digests_p + I*32*5 + 32*3], D
+ VMOVPS [mh_digests_p + I*32*5 + 32*4], E
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=32
+
+align 32
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
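
Each ymm lane above carries one SHA1 segment, so the vector macros are eight copies of the ordinary FIPS 180-1 round run in lockstep. As a reference for what SHA1_STEP_00_15 and SHA1_STEP_16_79 compute per lane, here is a minimal scalar C sketch; rol32 and the Ch round function mirror the rol32()/F1() macros in mh_sha1_internal.h later in this diff, and the *_sketch helper names are illustrative only, not part of ISA-L.

/*
 * Scalar reference for one vector lane of the AVX2 kernel above.
 */
#include <stdint.h>

static inline uint32_t rol32_sketch(uint32_t x, int r)
{
	return (x << r) | (x >> (32 - r));
}

/* One round with Ch and K = 0x5a827999, i.e. rounds 0..19 (MAGIC_F0/K00_19). */
static inline void sha1_round_f0_sketch(uint32_t *a, uint32_t *b, uint32_t *c,
					uint32_t *d, uint32_t *e, uint32_t wt)
{
	uint32_t f = *d ^ (*b & (*c ^ *d));	/* MAGIC_F0: Ch(B,C,D) */
	uint32_t t = rol32_sketch(*a, 5) + f + *e + 0x5a827999u + wt;

	*e = *d;				/* ROTATE_ARGS */
	*d = *c;
	*c = rol32_sketch(*b, 30);		/* PROLD B,30  */
	*b = *a;
	*a = t;
}

/* Message schedule for t >= 16, matching the W16 update in SHA1_STEP_16_79. */
static inline uint32_t sha1_schedule_sketch(uint32_t w[16], int t)
{
	uint32_t x = w[(t - 3) & 15] ^ w[(t - 8) & 15] ^
		     w[(t - 14) & 15] ^ w[(t - 16) & 15];

	return w[t & 15] = rol32_sketch(x, 1);
}

The same two helpers, with the parity and majority functions substituted for Ch and the matching K constants, describe rounds 20-79.
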
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm
new file mode 100644
index 000000000..3738c6d40
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_avx512.asm
@@ -0,0 +1,403 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA1 using AVX-512
+;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+default rel
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovdqu64
+;SIMD variables definition
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define HH0 zmm5
+%define HH1 zmm6
+%define HH2 zmm7
+%define HH3 zmm8
+%define HH4 zmm9
+%define KT zmm10
+%define XTMP0 zmm11
+%define XTMP1 zmm12
+%define SHUF_MASK zmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;using extra 16 ZMM registers to place the inverse input data
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;macros definition
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%F_IMMED %2
+
+ ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
+ ; E=D, D=C, C=ROTL_30(B), B=A, A=T
+
+ ; Ft
+ ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
+ ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
+ ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
+
+ vmovdqa32 XTMP1, B ; Copy B
+ vpaddd E, E, %%WT ; E = E + Wt
+ vpternlogd XTMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D)
+ vpaddd E, E, KT ; E = E + Wt + Kt
+ vprold XTMP0, A, 5 ; TMP0 = ROTL_5(A)
+ vpaddd E, E, XTMP1 ; E = Ft(B,C,D) + E + Kt + Wt
+ vprold B, B, 30 ; B = ROTL_30(B)
+ vpaddd E, E, XTMP0 ; E = T
+
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_79 4
+%define %%WT %1
+%define %%WTp2 %2
+%define %%WTp8 %3
+%define %%WTp13 %4
+ ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16)
+ ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt)
+ vpternlogd %%WT, %%WTp2, %%WTp8, 0x96
+ vpxord %%WT, %%WT, %%WTp13
+ vprold %%WT, %%WT, 1
+%endmacro
+
+%define APPEND(a,b) a %+ b
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ ; remove unwind info macros
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp + 0*16], xmm6
+ movdqa [rsp + 1*16], xmm7
+ movdqa [rsp + 2*16], xmm8
+ movdqa [rsp + 3*16], xmm9
+ movdqa [rsp + 4*16], xmm10
+ movdqa [rsp + 5*16], xmm11
+ movdqa [rsp + 6*16], xmm12
+ movdqa [rsp + 7*16], xmm13
+ movdqa [rsp + 8*16], xmm14
+ movdqa [rsp + 9*16], xmm15
+ mov [rsp + 10*16 + 0*8], r12
+ mov [rsp + 10*16 + 1*8], r13
+ mov [rsp + 10*16 + 2*8], r14
+ mov [rsp + 10*16 + 3*8], r15
+ mov [rsp + 10*16 + 4*8], rdi
+ mov [rsp + 10*16 + 5*8], rsi
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 |    | Ea| Eb | Ec |...| Ep |
+
+[bits 64]
+section .text
+align 32
+
+;void mh_sha1_block_avx512(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including segment digests (uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS])
+; arg 2 pointer to aligned_frame_buffer, which is used to store the big-endian data.
+; arg 3 number of 1KB blocks
+;
+global mh_sha1_block_avx512
+func(mh_sha1_block_avx512)
+ FUNC_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; align rsp to 64 Bytes needed by avx512
+ and rsp, ~0x3f
+
+ ; copy segs_digests into registers.
+ VMOVPS HH0, [mh_digests_p + 64*0]
+ VMOVPS HH1, [mh_digests_p + 64*1]
+ VMOVPS HH2, [mh_digests_p + 64*2]
+ VMOVPS HH3, [mh_digests_p + 64*3]
+ VMOVPS HH4, [mh_digests_p + 64*4]
+ ;a mask used to transform to big-endian data
+ vmovdqa64 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ ;using extra 16 ZMM registers instead of stack
+%assign I 0
+%rep 8
+%assign J (I+1)
+ VMOVPS APPEND(W,I),[mh_in_p + I*64+0*64]
+ VMOVPS APPEND(W,J),[mh_in_p + I*64+1*64]
+
+ vpshufb APPEND(W,I), APPEND(W,I), SHUF_MASK
+ vpshufb APPEND(W,J), APPEND(W,J), SHUF_MASK
+%assign I (I+2)
+%endrep
+
+ vmovdqa64 A, HH0
+ vmovdqa64 B, HH1
+ vmovdqa64 C, HH2
+ vmovdqa64 D, HH3
+ vmovdqa64 E, HH4
+
+ vmovdqa32 KT, [K00_19]
+%assign I 0xCA
+%assign J 0
+%assign K 2
+%assign L 8
+%assign M 13
+%assign N 0
+%rep 80
+ PROCESS_LOOP APPEND(W,J), I
+ %if N < 64
+ MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ %endif
+ %if N = 19
+ vmovdqa32 KT, [K20_39]
+ %assign I 0x96
+ %elif N = 39
+ vmovdqa32 KT, [K40_59]
+ %assign I 0xE8
+ %elif N = 59
+ vmovdqa32 KT, [K60_79]
+ %assign I 0x96
+ %endif
+ %if N % 20 = 19
+ PREFETCH_X [mh_in_p + 1024+128*(N / 20)]
+ PREFETCH_X [mh_in_p + 1024+128*(N / 20 +1)]
+ %endif
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%assign N (N+1)
+%endrep
+
+ ; Add old digest
+ vpaddd HH0,A, HH0
+ vpaddd HH1,B, HH1
+ vpaddd HH2,C, HH2
+ vpaddd HH3,D, HH3
+ vpaddd HH4,E, HH4
+
+ add mh_in_p, 1024
+ sub loops, 1
+ jne .block_loop
+
+ ; copy segs_digests to mh_digests_p
+ VMOVPS [mh_digests_p + 64*0], HH0
+ VMOVPS [mh_digests_p + 64*1], HH1
+ VMOVPS [mh_digests_p + 64*2], HH2
+ VMOVPS [mh_digests_p + 64*3], HH3
+ VMOVPS [mh_digests_p + 64*4], HH4
+
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+
+section .data align=64
+
+align 64
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+
+K20_39: dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+
+K40_59: dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+
+K60_79: dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_mh_sha1_block_avx512
+no_mh_sha1_block_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
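
Instead of the MAGIC_F* macro sequences, the AVX-512 kernel above folds each round's boolean function into a single vpternlogd whose immediate is the function's truth table: 0xCA for Ch (rounds 0-19), 0x96 for parity (rounds 20-39 and 60-79), and 0xE8 for Maj (rounds 40-59); the message schedule likewise uses 0x96 as a three-way XOR. A small, hedged C check of those immediates, assuming the usual vpternlogd convention that the destination operand supplies the most significant bit of the table index (ternlog32 is an illustrative helper, not a library function):

/*
 * Check the vpternlogd immediates used by PROCESS_LOOP above. For
 * vpternlogd dst, src2, src3, imm the result bit is bit (dst*4+src2*2+src3)
 * of imm, evaluated independently at every bit position.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t ternlog32(uint32_t a, uint32_t b, uint32_t c, uint8_t imm)
{
	uint32_t r = 0;
	int i;

	for (i = 0; i < 32; i++) {
		unsigned int idx = (((a >> i) & 1) << 2) |
				   (((b >> i) & 1) << 1) |
				   ((c >> i) & 1);
		r |= (uint32_t)((imm >> idx) & 1) << i;
	}
	return r;
}

int main(void)
{
	uint32_t b = 0x12345678, c = 0x9abcdef0, d = 0x0f1e2d3c;

	assert(ternlog32(b, c, d, 0xCA) == ((b & c) ^ (~b & d)));		/* Ch     */
	assert(ternlog32(b, c, d, 0x96) == (b ^ c ^ d));			/* Parity */
	assert(ternlog32(b, c, d, 0xE8) == ((b & c) ^ (b & d) ^ (c & d)));	/* Maj    */
	return 0;
}
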
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c
new file mode 100644
index 000000000..cdee69a4a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_base.c
@@ -0,0 +1,387 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha1_internal.h"
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Base multi-hash SHA1 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+#define store_w(s, i, w, ww) (w[i][s] = bswap(ww[i*HASH_SEGS+s])) // only used for step 0 ~ 15
+#define update_w(s, i, w) (w[i&15][s] = rol32(w[(i-3)&15][s]^w[(i-8)&15][s]^w[(i-14)&15][s]^w[(i-16)&15][s], 1)) // used for step > 15
+#define update_e_1(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F1(b[s],c[s],d[s]) + K_00_19 + w[i&15][s])
+#define update_e_2(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F2(b[s],c[s],d[s]) + K_20_39 + w[i&15][s])
+#define update_e_3(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F3(b[s],c[s],d[s]) + K_40_59 + w[i&15][s])
+#define update_e_4(s, a, b, c, d, e, i, w) (e[s] += rol32(a[s],5) + F4(b[s],c[s],d[s]) + K_60_79 + w[i&15][s])
+#define update_b(s, b) (b[s] = rol32(b[s],30))
+
+#define STORE_W(i, w, ww) \
+ store_w(0, i, w, ww); \
+ store_w(1, i, w, ww); \
+ store_w(2, i, w, ww); \
+ store_w(3, i, w, ww); \
+ store_w(4, i, w, ww); \
+ store_w(5, i, w, ww); \
+ store_w(6, i, w, ww); \
+ store_w(7, i, w, ww); \
+ store_w(8, i, w, ww); \
+ store_w(9, i, w, ww); \
+ store_w(10, i, w, ww); \
+ store_w(11, i, w, ww); \
+ store_w(12, i, w, ww); \
+ store_w(13, i, w, ww); \
+ store_w(14, i, w, ww); \
+ store_w(15, i, w, ww)
+
+#define UPDATE_W(i, w) \
+ update_w(0, i, w); \
+ update_w(1, i, w); \
+ update_w(2, i, w); \
+ update_w(3, i, w); \
+ update_w(4, i, w); \
+ update_w(5, i, w); \
+ update_w(6, i, w); \
+ update_w(7, i, w); \
+ update_w(8, i, w); \
+ update_w(9, i, w); \
+ update_w(10, i, w); \
+ update_w(11, i, w); \
+ update_w(12, i, w); \
+ update_w(13, i, w); \
+ update_w(14, i, w); \
+ update_w(15, i, w)
+
+#define UPDATE_E1(a, b, c, d, e, i, w) \
+ update_e_1(0, a, b, c, d, e, i, w); \
+ update_e_1(1, a, b, c, d, e, i, w); \
+ update_e_1(2, a, b, c, d, e, i, w); \
+ update_e_1(3, a, b, c, d, e, i, w); \
+ update_e_1(4, a, b, c, d, e, i, w); \
+ update_e_1(5, a, b, c, d, e, i, w); \
+ update_e_1(6, a, b, c, d, e, i, w); \
+ update_e_1(7, a, b, c, d, e, i, w); \
+ update_e_1(8, a, b, c, d, e, i, w); \
+ update_e_1(9, a, b, c, d, e, i, w); \
+ update_e_1(10, a, b, c, d, e, i, w); \
+ update_e_1(11, a, b, c, d, e, i, w); \
+ update_e_1(12, a, b, c, d, e, i, w); \
+ update_e_1(13, a, b, c, d, e, i, w); \
+ update_e_1(14, a, b, c, d, e, i, w); \
+ update_e_1(15, a, b, c, d, e, i, w)
+
+#define UPDATE_E2(a, b, c, d, e, i, w) \
+ update_e_2(0, a, b, c, d, e, i, w); \
+ update_e_2(1, a, b, c, d, e, i, w); \
+ update_e_2(2, a, b, c, d, e, i, w); \
+ update_e_2(3, a, b, c, d, e, i, w); \
+ update_e_2(4, a, b, c, d, e, i, w); \
+ update_e_2(5, a, b, c, d, e, i, w); \
+ update_e_2(6, a, b, c, d, e, i, w); \
+ update_e_2(7, a, b, c, d, e, i, w); \
+ update_e_2(8, a, b, c, d, e, i, w); \
+ update_e_2(9, a, b, c, d, e, i, w); \
+ update_e_2(10, a, b, c, d, e, i, w); \
+ update_e_2(11, a, b, c, d, e, i, w); \
+ update_e_2(12, a, b, c, d, e, i, w); \
+ update_e_2(13, a, b, c, d, e, i, w); \
+ update_e_2(14, a, b, c, d, e, i, w); \
+ update_e_2(15, a, b, c, d, e, i, w)
+
+#define UPDATE_E3(a, b, c, d, e, i, w) \
+ update_e_3(0, a, b, c, d, e, i, w); \
+ update_e_3(1, a, b, c, d, e, i, w); \
+ update_e_3(2, a, b, c, d, e, i, w); \
+ update_e_3(3, a, b, c, d, e, i, w); \
+ update_e_3(4, a, b, c, d, e, i, w); \
+ update_e_3(5, a, b, c, d, e, i, w); \
+ update_e_3(6, a, b, c, d, e, i, w); \
+ update_e_3(7, a, b, c, d, e, i, w); \
+ update_e_3(8, a, b, c, d, e, i, w); \
+ update_e_3(9, a, b, c, d, e, i, w); \
+ update_e_3(10, a, b, c, d, e, i, w); \
+ update_e_3(11, a, b, c, d, e, i, w); \
+ update_e_3(12, a, b, c, d, e, i, w); \
+ update_e_3(13, a, b, c, d, e, i, w); \
+ update_e_3(14, a, b, c, d, e, i, w); \
+ update_e_3(15, a, b, c, d, e, i, w)
+
+#define UPDATE_E4(a, b, c, d, e, i, w) \
+ update_e_4(0, a, b, c, d, e, i, w); \
+ update_e_4(1, a, b, c, d, e, i, w); \
+ update_e_4(2, a, b, c, d, e, i, w); \
+ update_e_4(3, a, b, c, d, e, i, w); \
+ update_e_4(4, a, b, c, d, e, i, w); \
+ update_e_4(5, a, b, c, d, e, i, w); \
+ update_e_4(6, a, b, c, d, e, i, w); \
+ update_e_4(7, a, b, c, d, e, i, w); \
+ update_e_4(8, a, b, c, d, e, i, w); \
+ update_e_4(9, a, b, c, d, e, i, w); \
+ update_e_4(10, a, b, c, d, e, i, w); \
+ update_e_4(11, a, b, c, d, e, i, w); \
+ update_e_4(12, a, b, c, d, e, i, w); \
+ update_e_4(13, a, b, c, d, e, i, w); \
+ update_e_4(14, a, b, c, d, e, i, w); \
+ update_e_4(15, a, b, c, d, e, i, w)
+
+#define UPDATE_B(b) \
+ update_b(0, b); \
+ update_b(1, b); \
+ update_b(2, b); \
+ update_b(3, b); \
+ update_b(4, b); \
+ update_b(5, b); \
+ update_b(6, b); \
+ update_b(7, b); \
+ update_b(8, b); \
+ update_b(9, b); \
+ update_b(10, b); \
+ update_b(11, b); \
+ update_b(12, b); \
+ update_b(13, b); \
+ update_b(14, b); \
+ update_b(15, b)
+
+static inline void step00_15(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS],
+ uint32_t * ww)
+{
+ STORE_W(i, w, ww);
+ UPDATE_E1(a, b, c, d, e, i, w);
+ UPDATE_B(b);
+}
+
+static inline void step16_19(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS])
+{
+ UPDATE_W(i, w);
+ UPDATE_E1(a, b, c, d, e, i, w);
+ UPDATE_B(b);
+
+}
+
+static inline void step20_39(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS])
+{
+ UPDATE_W(i, w);
+ UPDATE_E2(a, b, c, d, e, i, w);
+ UPDATE_B(b);
+}
+
+static inline void step40_59(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS])
+{
+ UPDATE_W(i, w);
+ UPDATE_E3(a, b, c, d, e, i, w);
+ UPDATE_B(b);
+}
+
+static inline void step60_79(int i, uint32_t * a, uint32_t * b, uint32_t * c,
+ uint32_t * d, uint32_t * e, uint32_t(*w)[HASH_SEGS])
+{
+ UPDATE_W(i, w);
+ UPDATE_E4(a, b, c, d, e, i, w);
+ UPDATE_B(b);
+}
+
+static inline void init_abcde(uint32_t * xx, uint32_t n,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS])
+{
+ xx[0] = digests[n][0];
+ xx[1] = digests[n][1];
+ xx[2] = digests[n][2];
+ xx[3] = digests[n][3];
+ xx[4] = digests[n][4];
+ xx[5] = digests[n][5];
+ xx[6] = digests[n][6];
+ xx[7] = digests[n][7];
+ xx[8] = digests[n][8];
+ xx[9] = digests[n][9];
+ xx[10] = digests[n][10];
+ xx[11] = digests[n][11];
+ xx[12] = digests[n][12];
+ xx[13] = digests[n][13];
+ xx[14] = digests[n][14];
+ xx[15] = digests[n][15];
+}
+
+static inline void add_abcde(uint32_t * xx, uint32_t n,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS])
+{
+ digests[n][0] += xx[0];
+ digests[n][1] += xx[1];
+ digests[n][2] += xx[2];
+ digests[n][3] += xx[3];
+ digests[n][4] += xx[4];
+ digests[n][5] += xx[5];
+ digests[n][6] += xx[6];
+ digests[n][7] += xx[7];
+ digests[n][8] += xx[8];
+ digests[n][9] += xx[9];
+ digests[n][10] += xx[10];
+ digests[n][11] += xx[11];
+ digests[n][12] += xx[12];
+ digests[n][13] += xx[13];
+ digests[n][14] += xx[14];
+ digests[n][15] += xx[15];
+}
+
+/*
+ * API to perform 0-79 steps of the multi-hash algorithm for
+ * a single block of data. The caller is responsible for ensuring
+ * a full block of data input.
+ *
+ * Arguments:
+ *   input  - the pointer to the input data
+ *   digests - the space to hold the digests for all segments
+ *   frame_buffer - temporary working buffer for the big-endian message schedule
+ *
+ * Return:
+ * N/A
+ */
+void mh_sha1_single(const uint8_t * input, uint32_t(*digests)[HASH_SEGS],
+ uint8_t * frame_buffer)
+{
+ uint32_t aa[HASH_SEGS], bb[HASH_SEGS], cc[HASH_SEGS], dd[HASH_SEGS], ee[HASH_SEGS];
+ uint32_t *ww = (uint32_t *) input;
+ uint32_t(*w)[HASH_SEGS];
+
+ w = (uint32_t(*)[HASH_SEGS]) frame_buffer;
+
+ init_abcde(aa, 0, digests);
+ init_abcde(bb, 1, digests);
+ init_abcde(cc, 2, digests);
+ init_abcde(dd, 3, digests);
+ init_abcde(ee, 4, digests);
+
+ step00_15(0, aa, bb, cc, dd, ee, w, ww);
+ step00_15(1, ee, aa, bb, cc, dd, w, ww);
+ step00_15(2, dd, ee, aa, bb, cc, w, ww);
+ step00_15(3, cc, dd, ee, aa, bb, w, ww);
+ step00_15(4, bb, cc, dd, ee, aa, w, ww);
+ step00_15(5, aa, bb, cc, dd, ee, w, ww);
+ step00_15(6, ee, aa, bb, cc, dd, w, ww);
+ step00_15(7, dd, ee, aa, bb, cc, w, ww);
+ step00_15(8, cc, dd, ee, aa, bb, w, ww);
+ step00_15(9, bb, cc, dd, ee, aa, w, ww);
+ step00_15(10, aa, bb, cc, dd, ee, w, ww);
+ step00_15(11, ee, aa, bb, cc, dd, w, ww);
+ step00_15(12, dd, ee, aa, bb, cc, w, ww);
+ step00_15(13, cc, dd, ee, aa, bb, w, ww);
+ step00_15(14, bb, cc, dd, ee, aa, w, ww);
+ step00_15(15, aa, bb, cc, dd, ee, w, ww);
+
+ step16_19(16, ee, aa, bb, cc, dd, w);
+ step16_19(17, dd, ee, aa, bb, cc, w);
+ step16_19(18, cc, dd, ee, aa, bb, w);
+ step16_19(19, bb, cc, dd, ee, aa, w);
+
+ step20_39(20, aa, bb, cc, dd, ee, w);
+ step20_39(21, ee, aa, bb, cc, dd, w);
+ step20_39(22, dd, ee, aa, bb, cc, w);
+ step20_39(23, cc, dd, ee, aa, bb, w);
+ step20_39(24, bb, cc, dd, ee, aa, w);
+ step20_39(25, aa, bb, cc, dd, ee, w);
+ step20_39(26, ee, aa, bb, cc, dd, w);
+ step20_39(27, dd, ee, aa, bb, cc, w);
+ step20_39(28, cc, dd, ee, aa, bb, w);
+ step20_39(29, bb, cc, dd, ee, aa, w);
+ step20_39(30, aa, bb, cc, dd, ee, w);
+ step20_39(31, ee, aa, bb, cc, dd, w);
+ step20_39(32, dd, ee, aa, bb, cc, w);
+ step20_39(33, cc, dd, ee, aa, bb, w);
+ step20_39(34, bb, cc, dd, ee, aa, w);
+ step20_39(35, aa, bb, cc, dd, ee, w);
+ step20_39(36, ee, aa, bb, cc, dd, w);
+ step20_39(37, dd, ee, aa, bb, cc, w);
+ step20_39(38, cc, dd, ee, aa, bb, w);
+ step20_39(39, bb, cc, dd, ee, aa, w);
+
+ step40_59(40, aa, bb, cc, dd, ee, w);
+ step40_59(41, ee, aa, bb, cc, dd, w);
+ step40_59(42, dd, ee, aa, bb, cc, w);
+ step40_59(43, cc, dd, ee, aa, bb, w);
+ step40_59(44, bb, cc, dd, ee, aa, w);
+ step40_59(45, aa, bb, cc, dd, ee, w);
+ step40_59(46, ee, aa, bb, cc, dd, w);
+ step40_59(47, dd, ee, aa, bb, cc, w);
+ step40_59(48, cc, dd, ee, aa, bb, w);
+ step40_59(49, bb, cc, dd, ee, aa, w);
+ step40_59(50, aa, bb, cc, dd, ee, w);
+ step40_59(51, ee, aa, bb, cc, dd, w);
+ step40_59(52, dd, ee, aa, bb, cc, w);
+ step40_59(53, cc, dd, ee, aa, bb, w);
+ step40_59(54, bb, cc, dd, ee, aa, w);
+ step40_59(55, aa, bb, cc, dd, ee, w);
+ step40_59(56, ee, aa, bb, cc, dd, w);
+ step40_59(57, dd, ee, aa, bb, cc, w);
+ step40_59(58, cc, dd, ee, aa, bb, w);
+ step40_59(59, bb, cc, dd, ee, aa, w);
+
+ step60_79(60, aa, bb, cc, dd, ee, w);
+ step60_79(61, ee, aa, bb, cc, dd, w);
+ step60_79(62, dd, ee, aa, bb, cc, w);
+ step60_79(63, cc, dd, ee, aa, bb, w);
+ step60_79(64, bb, cc, dd, ee, aa, w);
+ step60_79(65, aa, bb, cc, dd, ee, w);
+ step60_79(66, ee, aa, bb, cc, dd, w);
+ step60_79(67, dd, ee, aa, bb, cc, w);
+ step60_79(68, cc, dd, ee, aa, bb, w);
+ step60_79(69, bb, cc, dd, ee, aa, w);
+ step60_79(70, aa, bb, cc, dd, ee, w);
+ step60_79(71, ee, aa, bb, cc, dd, w);
+ step60_79(72, dd, ee, aa, bb, cc, w);
+ step60_79(73, cc, dd, ee, aa, bb, w);
+ step60_79(74, bb, cc, dd, ee, aa, w);
+ step60_79(75, aa, bb, cc, dd, ee, w);
+ step60_79(76, ee, aa, bb, cc, dd, w);
+ step60_79(77, dd, ee, aa, bb, cc, w);
+ step60_79(78, cc, dd, ee, aa, bb, w);
+ step60_79(79, bb, cc, dd, ee, aa, w);
+
+ add_abcde(aa, 0, digests);
+ add_abcde(bb, 1, digests);
+ add_abcde(cc, 2, digests);
+ add_abcde(dd, 3, digests);
+ add_abcde(ee, 4, digests);
+}
+
+void mh_sha1_block_base(const uint8_t * input_data,
+ uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks)
+{
+ uint32_t i;
+
+ for (i = 0; i < num_blocks; i++) {
+ mh_sha1_single(input_data, digests, frame_buffer);
+ input_data += MH_SHA1_BLOCK_SIZE;
+ }
+
+ return;
+}
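
mh_sha1_block_base() only consumes whole 1 KB blocks, updating all 16 interleaved segment digests at once; anything shorter is left to the tail/finalize path that appears later in this diff. A hedged usage sketch follows, assuming the MH_SHA1_BLOCK_SIZE/SHA1_DIGEST_WORDS/HASH_SEGS macros from mh_sha1.h and the MH_SHA1_H* initial values from mh_sha1_internal.h; the helper name and the in-place seeding are illustrative only, since the library's own init/update wrappers normally do this.

/*
 * Hedged usage sketch: hash two full 1KB blocks with the base kernel.
 */
#include <stdint.h>
#include "mh_sha1_internal.h"

static void hash_two_blocks_sketch(const uint8_t *data,
				   uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS])
{
	uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE];
	uint32_t j;

	/* every segment starts from the standard SHA1 initial values */
	for (j = 0; j < HASH_SEGS; j++) {
		digests[0][j] = MH_SHA1_H0;
		digests[1][j] = MH_SHA1_H1;
		digests[2][j] = MH_SHA1_H2;
		digests[3][j] = MH_SHA1_H3;
		digests[4][j] = MH_SHA1_H4;
	}

	/* data must hold 2 * MH_SHA1_BLOCK_SIZE bytes; the tail/finalize
	 * path later in this diff handles whatever does not fill a block */
	mh_sha1_block_base(data, digests, frame_buffer, 2);
}
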
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm
new file mode 100644
index 000000000..1e53cfec9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_block_sse.asm
@@ -0,0 +1,494 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA1 using SSE
+;;
+
+%include "reg_sizes.asm"
+default rel
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regC
+ pxor %%regF,%%regD
+ pand %%regF,%%regB
+ pxor %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regD
+ pxor %%regF,%%regC
+ pxor %%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regB
+ movdqa %%regT,%%regB
+ por %%regF,%%regC
+ pand %%regT,%%regC
+ pand %%regF,%%regD
+ por %%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ pslld %%reg, %%imm
+ psrld %%tmp, (32-%%imm)
+ por %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ paddd %%regE,[%%data + (%%memW * 16)]
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 7*8 ; must be an odd multiple of 8
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ end_prolog
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg3
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp2
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+
+%define pref tmp3
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define MOVPS movups
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 |    | Ea| Eb | Ec |...| Ep |
+
+align 32
+
+;void mh_sha1_block_sse(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, including segment digests (uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS])
+; arg 2 pointer to aligned_frame_buffer, which is used to store the big-endian data.
+; arg 3 number of 1KB blocks
+;
+global mh_sha1_block_sse:function internal
+func(mh_sha1_block_sse)
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 16 Bytes needed by sse
+ and rsp, ~0x0F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 5
+ MOVPS A, [mh_digests_p + I*64 + 16*0]
+ MOVPS B, [mh_digests_p + I*64 + 16*1]
+ MOVPS C, [mh_digests_p + I*64 + 16*2]
+ MOVPS D, [mh_digests_p + I*64 + 16*3]
+
+ movdqa [rsp + I*64 + 16*0], A
+ movdqa [rsp + I*64 + 16*1], B
+ movdqa [rsp + I*64 + 16*2], C
+ movdqa [rsp + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ movdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4
+ %assign I 0
+ %rep 16
+ MOVPS T0,[mh_in_p + I*64+0*16]
+ MOVPS T1,[mh_in_p + I*64+1*16]
+ MOVPS T2,[mh_in_p + I*64+2*16]
+ MOVPS T3,[mh_in_p + I*64+3*16]
+
+ pshufb T0, F
+ movdqa [mh_data_p +(I)*16 +0*256],T0
+ pshufb T1, F
+ movdqa [mh_data_p +(I)*16 +1*256],T1
+ pshufb T2, F
+ movdqa [mh_data_p +(I)*16 +2*256],T2
+ pshufb T3, F
+ movdqa [mh_data_p +(I)*16 +3*256],T3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+	mov	pref, 1024		;avoid prefetching repeatedly
+ .segs_loop:
+ ;; Initialize digests
+ movdqa A, [rsp + 0*64 + mh_segs]
+ movdqa B, [rsp + 1*64 + mh_segs]
+ movdqa C, [rsp + 2*64 + mh_segs]
+ movdqa D, [rsp + 3*64 + mh_segs]
+ movdqa E, [rsp + 4*64 + mh_segs]
+
+ movdqa AA, A
+ movdqa BB, B
+ movdqa CC, C
+ movdqa DD, D
+ movdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ movdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 16...19
+ movdqa W16, [mh_data_p + ((16 - 16) & 15) * 16]
+ movdqa W15, [mh_data_p + ((16 - 15) & 15) * 16]
+ %rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+;; do rounds 20...39
+ movdqa K, [K20_39]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 40...59
+ movdqa K, [K40_59]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*1]
+;; do rounds 60...79
+ movdqa K, [K60_79]
+ %rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ paddd A, AA
+ paddd B, BB
+ paddd C, CC
+ paddd D, DD
+ paddd E, EE
+
+ ; write out digests
+ movdqa [rsp + 0*64 + mh_segs], A
+ movdqa [rsp + 1*64 + mh_segs], B
+ movdqa [rsp + 2*64 + mh_segs], C
+ movdqa [rsp + 3*64 + mh_segs], D
+ movdqa [rsp + 4*64 + mh_segs], E
+
+ add pref, 256
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 5
+ movdqa A, [rsp + I*64 + 16*0]
+ movdqa B, [rsp + I*64 + 16*1]
+ movdqa C, [rsp + I*64 + 16*2]
+ movdqa D, [rsp + I*64 + 16*3]
+
+ MOVPS [mh_digests_p + I*64 + 16*0], A
+ MOVPS [mh_digests_p + I*64 + 16*1], B
+ MOVPS [mh_digests_p + I*64 + 16*2], C
+ MOVPS [mh_digests_p + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
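
The .block_loop shuffle in the SSE kernel (and its wider AVX2/AVX-512 counterparts) only byte-swaps each dword and regroups the 16 interleaved segments so that every .segs_loop pass can read one contiguous slice of frame_buffer. Below is a hedged scalar model of that transform for the 4-segment SSE grouping, derived from the load/store offsets above; the SEGS/SEG_GROUP constants and helper names are illustrative only.

/*
 * Hedged scalar model of the SSE .block_loop shuffle: the 1KB block stores
 * message word i for all 16 segments back to back; the kernel byte-swaps
 * each dword and regroups the segments into four groups of 4 so that each
 * .segs_loop pass reads one contiguous 256-byte slice of frame_buffer.
 */
#include <stdint.h>

#define SEGS		16
#define SEG_GROUP	4	/* dword lanes per xmm register in the SSE path */

static uint32_t bswap32_sketch(uint32_t x)
{
	return (x << 24) | ((x & 0xff00u) << 8) |
	       ((x & 0xff0000u) >> 8) | (x >> 24);
}

static void shuffle_block_sse_sketch(const uint32_t in[16 * SEGS],
				     uint32_t frame[16 * SEGS])
{
	int word, seg;

	for (word = 0; word < 16; word++)
		for (seg = 0; seg < SEGS; seg++) {
			int group = seg / SEG_GROUP;
			int lane = seg % SEG_GROUP;

			frame[group * 16 * SEG_GROUP + word * SEG_GROUP + lane] =
			    bswap32_sketch(in[word * SEGS + seg]);
		}
}

The AVX2 path regroups into two blocks of 8 segments and the AVX-512 path keeps all 16 segments in registers, but the per-dword byte swap is the same.
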
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c
new file mode 100644
index 000000000..9eab755a6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_finalize_base.c
@@ -0,0 +1,122 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * mh_sha1_finalize_base.c provides the default definitions of mh_sha1_finalize_XXX
+ * and mh_sha1_tail_XXX. By default the macros expand to the base type, generating
+ * mh_sha1_finalize_base and mh_sha1_tail_base; other types are generated by
+ * mh_sha1.c through different predefined macros.
+ * mh_sha1_tail calculates the last, incomplete block of input data;
+ * mh_sha1_finalize is the mh_sha1_ctx wrapper of mh_sha1_tail.
+ */
+#ifndef MH_SHA1_FINALIZE_FUNCTION
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+#define MH_SHA1_FINALIZE_FUNCTION mh_sha1_finalize_base
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_base
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_base
+#define MH_SHA1_FINALIZE_SLVER
+#endif
+
+void MH_SHA1_TAIL_FUNCTION(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS], uint8_t * frame_buffer,
+ uint32_t digests[SHA1_DIGEST_WORDS])
+{
+ uint64_t partial_buffer_len, len_in_bit;
+
+ partial_buffer_len = total_len % MH_SHA1_BLOCK_SIZE;
+
+ // Padding the first block
+ partial_buffer[partial_buffer_len] = 0x80;
+ partial_buffer_len++;
+ memset(partial_buffer + partial_buffer_len, 0,
+ MH_SHA1_BLOCK_SIZE - partial_buffer_len);
+
+	// Calculate the first block without total_length if the padding needs 2 blocks
+ if (partial_buffer_len > (MH_SHA1_BLOCK_SIZE - 8)) {
+ MH_SHA1_BLOCK_FUNCTION(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1);
+ //Padding the second block
+ memset(partial_buffer, 0, MH_SHA1_BLOCK_SIZE);
+ }
+ //Padding the block
+ len_in_bit = bswap64((uint64_t) total_len * 8);
+ *(uint64_t *) (partial_buffer + MH_SHA1_BLOCK_SIZE - 8) = len_in_bit;
+ MH_SHA1_BLOCK_FUNCTION(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1);
+
+ //Calculate multi-hash SHA1 digests (segment digests as input message)
+ sha1_for_mh_sha1((uint8_t *) mh_sha1_segs_digests, digests,
+ 4 * SHA1_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
+
+int MH_SHA1_FINALIZE_FUNCTION(struct mh_sha1_ctx *ctx, void *mh_sha1_digest)
+{
+ uint8_t *partial_block_buffer;
+ uint64_t total_len;
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+
+ if (ctx == NULL)
+ return MH_SHA1_CTX_ERROR_NULL;
+
+ total_len = ctx->total_length;
+ partial_block_buffer = ctx->partial_block_buffer;
+
+ /* mh_sha1 tail */
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+
+ MH_SHA1_TAIL_FUNCTION(partial_block_buffer, total_len, mh_sha1_segs_digests,
+ aligned_frame_buffer, ctx->mh_sha1_digest);
+
+ /* Output the digests of mh_sha1 */
+ if (mh_sha1_digest != NULL) {
+ ((uint32_t *) mh_sha1_digest)[0] = ctx->mh_sha1_digest[0];
+ ((uint32_t *) mh_sha1_digest)[1] = ctx->mh_sha1_digest[1];
+ ((uint32_t *) mh_sha1_digest)[2] = ctx->mh_sha1_digest[2];
+ ((uint32_t *) mh_sha1_digest)[3] = ctx->mh_sha1_digest[3];
+ ((uint32_t *) mh_sha1_digest)[4] = ctx->mh_sha1_digest[4];
+ }
+
+ return MH_SHA1_CTX_ERROR_NONE;
+}
+
+#ifdef MH_SHA1_FINALIZE_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver mh_sha1_finalize_base_slver_0000027b;
+struct slver mh_sha1_finalize_base_slver = { 0x027b, 0x00, 0x00 };
+#endif
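
The tail logic above pads exactly like plain SHA1, only at the 1 KB multi-hash block size: append 0x80, zero-fill, and place the big-endian bit length in the last 8 bytes, spilling into a second all-padding block when fewer than 8 bytes remain. A small hedged sketch of that decision, assuming MH_SHA1_BLOCK_SIZE is 1024 as used throughout this diff (tail_blocks_sketch is illustrative only):

/*
 * Hedged sketch of the padding decision made by mh_sha1_tail_base() above:
 * how many extra 1KB blocks does the tail hash for a given total_len?
 */
#include <stdint.h>

static int tail_blocks_sketch(uint32_t total_len)
{
	uint64_t partial = total_len % 1024;

	partial += 1;			/* the 0x80 terminator byte */

	/* if the 8-byte big-endian bit length no longer fits, a second,
	 * all-padding block is hashed as well */
	return (partial > 1024 - 8) ? 2 : 1;
}
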
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h
new file mode 100644
index 000000000..e8f226cbc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_internal.h
@@ -0,0 +1,323 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA1_INTERNAL_H_
+#define _MH_SHA1_INTERNAL_H_
+
+/**
+ * @file mh_sha1_internal.h
+ * @brief mh_sha1 internal function prototypes and macros
+ *
+ * Interface for mh_sha1 internal functions
+ *
+ */
+#include <stdint.h>
+#include "mh_sha1.h"
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+ // 64byte pointer align
+#define ALIGN_64(pointer) ( ((uint64_t)(pointer) + 0x3F)&(~0x3F) )
+
+ /*******************************************************************
+ *mh_sha1 constants and macros
+ ******************************************************************/
+ /* mh_sha1 constants */
+#define MH_SHA1_H0 0x67452301UL
+#define MH_SHA1_H1 0xefcdab89UL
+#define MH_SHA1_H2 0x98badcfeUL
+#define MH_SHA1_H3 0x10325476UL
+#define MH_SHA1_H4 0xc3d2e1f0UL
+
+#define K_00_19 0x5a827999UL
+#define K_20_39 0x6ed9eba1UL
+#define K_40_59 0x8f1bbcdcUL
+#define K_60_79 0xca62c1d6UL
+
+ /* mh_sha1 macros */
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (b ^ c ^ d)
+#define F3(b,c,d) ((b & c) | (d & (b | c)))
+#define F4(b,c,d) (b ^ c ^ d)
+
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+
+#define bswap(x) (((x)<<24) | (((x)&0xff00)<<8) | (((x)&0xff0000)>>8) | ((x)>>24))
+#define bswap64(x) (((x)<<56) | (((x)&0xff00)<<40) | (((x)&0xff0000)<<24) | \
+ (((x)&0xff000000)<<8) | (((x)&0xff00000000ull)>>8) | \
+ (((x)&0xff0000000000ull)<<24) | \
+ (((x)&0xff000000000000ull)<<40) | \
+ (((x)&0xff00000000000000ull)<<56))
+
+ /*******************************************************************
+ * SHA1 API internal function prototypes
+ ******************************************************************/
+
+ /**
+ * @brief Performs complete SHA1 algorithm.
+ *
+ * @param input Pointer to buffer containing the input message.
+ * @param digest Pointer to digest to update.
+ * @param len Length of buffer.
+ * @returns None
+ */
+ void sha1_for_mh_sha1(const uint8_t * input_data, uint32_t * digest, const uint32_t len);
+
+ /**
+ * @brief Calculate sha1 digest of blocks whose size is SHA1_BLOCK_SIZE
+ *
+ * @param data Pointer to data buffer containing the input message.
+ * @param digest Pointer to sha1 digest.
+ * @returns None
+ */
+ void sha1_single_for_mh_sha1(const uint8_t * data, uint32_t digest[]);
+
+ /*******************************************************************
+ * mh_sha1 API internal function prototypes
+ * Multiple versions of Update and Finalize functions are supplied which use
+ * multiple versions of block and tail process subfunctions.
+ ******************************************************************/
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @returns none
+ *
+ */
+ void mh_sha1_tail(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha1_digest mh_sha1 digest
+ * @returns none
+ *
+ */
+ void mh_sha1_tail_base(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * @requires SSE
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha1_digest mh_sha1 digest
+ * @returns none
+ *
+ */
+ void mh_sha1_tail_sse(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * @requires AVX
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha1_digest mh_sha1 digest
+ * @returns none
+ *
+ */
+ void mh_sha1_tail_avx(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of input data which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * @requires AVX2
+ *
+ * @param partial_buffer Pointer to the start addr of remainder
+ * @param total_len The total length of all sections of input data.
+ * @param  mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha1_digest mh_sha1 digest
+ * @returns none
+ *
+ */
+ void mh_sha1_tail_avx2(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Tail process for multi-hash sha1.
+ *
+ * Calculate the remainder of the input data, which is less than MH_SHA1_BLOCK_SIZE.
+ * It will output the final SHA1 digest based on mh_sha1_segs_digests.
+ *
+ * @requires AVX512
+ *
+ * @param partial_buffer Pointer to the start addr of the remainder
+ * @param total_len The total length of all sections of input data.
+ * @param mh_sha1_segs_digests The digests of all 16 segments.
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param mh_sha1_digest mh_sha1 digest
+ * @returns none
+ *
+ */
+ void mh_sha1_tail_avx512(uint8_t *partial_buffer, uint32_t total_len,
+ uint32_t (*mh_sha1_segs_digests)[HASH_SEGS],
+ uint8_t *frame_buffer, uint32_t mh_sha1_digest[SHA1_DIGEST_WORDS]);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block_base(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * @requires SSE
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block_sse(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * @requires AVX
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block_avx(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * @requires AVX2
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block_avx2(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+ /**
+ * @brief Calculate the mh_sha1 digest of blocks whose size is MH_SHA1_BLOCK_SIZE*N.
+ *
+ * @requires AVX512
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_block_avx512(const uint8_t * input_data, uint32_t digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm
new file mode 100644
index 000000000..83c39a315
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_multibinary.asm
@@ -0,0 +1,83 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf32
+ [bits 32]
+%else
+ default rel
+ [bits 64]
+
+ extern mh_sha1_update_sse
+ extern mh_sha1_update_avx
+ extern mh_sha1_update_avx2
+ extern mh_sha1_finalize_sse
+ extern mh_sha1_finalize_avx
+ extern mh_sha1_finalize_avx2
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ extern mh_sha1_update_avx512
+ extern mh_sha1_finalize_avx512
+ %endif
+
+%endif
+
+extern mh_sha1_update_base
+extern mh_sha1_finalize_base
+
+mbin_interface mh_sha1_update
+mbin_interface mh_sha1_finalize
+
+%ifidn __OUTPUT_FORMAT__, elf64
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 mh_sha1_update, mh_sha1_update_base, mh_sha1_update_sse, mh_sha1_update_avx, mh_sha1_update_avx2, mh_sha1_update_avx512
+ mbin_dispatch_init6 mh_sha1_finalize, mh_sha1_finalize_base, mh_sha1_finalize_sse, mh_sha1_finalize_avx, mh_sha1_finalize_avx2, mh_sha1_finalize_avx512
+ %else
+ mbin_dispatch_init5 mh_sha1_update, mh_sha1_update_base, mh_sha1_update_sse, mh_sha1_update_avx, mh_sha1_update_avx2
+ mbin_dispatch_init5 mh_sha1_finalize, mh_sha1_finalize_base, mh_sha1_finalize_sse, mh_sha1_finalize_avx, mh_sha1_finalize_avx2
+ %endif
+
+%else
+ mbin_dispatch_init2 mh_sha1_update, mh_sha1_update_base
+ mbin_dispatch_init2 mh_sha1_finalize, mh_sha1_finalize_base
+%endif
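+
+; The mbin_* macros above (from multibinary.asm) generate dispatcher stubs:
+; mh_sha1_update and mh_sha1_finalize resolve at runtime, based on detected
+; CPU features, to the most capable of the base/sse/avx/avx2(/avx512)
+; implementations listed in the mbin_dispatch_init* calls.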
+
+;;; func core, ver, snum
+slversion mh_sha1_update, 00, 02, 0272
+slversion mh_sha1_finalize, 00, 02, 0273
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c
new file mode 100644
index 000000000..72b2a5237
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_perf.c
@@ -0,0 +1,180 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Loop many times over the same buffer
+# define TEST_LEN 16*1024
+# define TEST_LOOPS 20000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define TEST_LEN 32*1024*1024
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#define TEST_MEM TEST_LEN
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_CTX_ERROR_NONE){ \
+ printf("The mh_sha1 function is failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha1_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 20);
+ printf("ref: ");
+ dump((char *)hash_test, 20);
+ }
+
+ return mh_sha1_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int i, fail = 0;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ struct mh_sha1_ctx *update_ctx_test = NULL, *update_ctx_base = NULL;
+ struct perf start, stop;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_perf:\n");
+
+ buff = malloc(TEST_LEN);
+ update_ctx_test = malloc(sizeof(*update_ctx_test));
+ update_ctx_base = malloc(sizeof(*update_ctx_base));
+
+ if (buff == NULL || update_ctx_base == NULL || update_ctx_test == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ // mh_sha1 base version
+ mh_sha1_init(update_ctx_base);
+ mh_sha1_update_base(update_ctx_base, buff, TEST_LEN);
+ mh_sha1_finalize_base(update_ctx_base, hash_base);
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS / 10; i++) {
+ mh_sha1_init(update_ctx_base);
+ mh_sha1_update_base(update_ctx_base, buff, TEST_LEN);
+ mh_sha1_finalize_base(update_ctx_base, hash_base);
+ }
+ perf_stop(&stop);
+ printf("mh_sha1_update_base" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ //Update feature test
+ CHECK_RETURN(mh_sha1_init(update_ctx_test));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test));
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ CHECK_RETURN(mh_sha1_init(update_ctx_test));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx_test, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx_test, hash_test));
+ }
+ perf_stop(&stop);
+ printf(xstr(TEST_UPDATE_FUNCTION) TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ // Check results
+ fail = compare_digests(hash_base, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", TEST_LEN);
+ return -1;
+ }
+
+ printf("Pass func check\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c
new file mode 100644
index 000000000..fee928a9c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_ref.c
@@ -0,0 +1,438 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_internal.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+ // The macros and sub-functions below already exist in sha1_for_mh_sha1.c,
+ // which is part of the ISA-L library as internal functions. They are
+ // duplicated here because of a linking issue: mh_sha1_ref() needs these
+ // macros and sub-functions without linking against the ISA-L library, so
+ // they are repeated in order to keep the essential sub-functions in this
+ // file's own object file.
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#define W(x) w[(x) & 15]
+
+#define step00_19(i,a,b,c,d,e) \
+ if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ else W(i) = bswap(ww[i]); \
+ e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \
+ b = rol32(b,30)
+
+#define step20_39(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \
+ b = rol32(b,30)
+
+#define step40_59(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \
+ b = rol32(b,30)
+
+#define step60_79(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \
+ b = rol32(b,30)
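+
+/*
+ * The W(x) macro masks its index with 15, so the 80-round SHA1 message
+ * schedule lives in a rolling 16-word window w[0..15]. For i > 15, W(i) is
+ * computed in place from W(i-3), W(i-8), W(i-14) and W(i-16); e.g. W(17)
+ * and W(1) both map to w[1], so round 17 overwrites the slot that round 1
+ * consumed.
+ */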
+
+void sha1_single_for_mh_sha1_ref(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e;
+ uint32_t w[16] = { 0 };
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+
+ step00_19(0, a, b, c, d, e);
+ step00_19(1, e, a, b, c, d);
+ step00_19(2, d, e, a, b, c);
+ step00_19(3, c, d, e, a, b);
+ step00_19(4, b, c, d, e, a);
+ step00_19(5, a, b, c, d, e);
+ step00_19(6, e, a, b, c, d);
+ step00_19(7, d, e, a, b, c);
+ step00_19(8, c, d, e, a, b);
+ step00_19(9, b, c, d, e, a);
+ step00_19(10, a, b, c, d, e);
+ step00_19(11, e, a, b, c, d);
+ step00_19(12, d, e, a, b, c);
+ step00_19(13, c, d, e, a, b);
+ step00_19(14, b, c, d, e, a);
+ step00_19(15, a, b, c, d, e);
+ step00_19(16, e, a, b, c, d);
+ step00_19(17, d, e, a, b, c);
+ step00_19(18, c, d, e, a, b);
+ step00_19(19, b, c, d, e, a);
+
+ step20_39(20, a, b, c, d, e);
+ step20_39(21, e, a, b, c, d);
+ step20_39(22, d, e, a, b, c);
+ step20_39(23, c, d, e, a, b);
+ step20_39(24, b, c, d, e, a);
+ step20_39(25, a, b, c, d, e);
+ step20_39(26, e, a, b, c, d);
+ step20_39(27, d, e, a, b, c);
+ step20_39(28, c, d, e, a, b);
+ step20_39(29, b, c, d, e, a);
+ step20_39(30, a, b, c, d, e);
+ step20_39(31, e, a, b, c, d);
+ step20_39(32, d, e, a, b, c);
+ step20_39(33, c, d, e, a, b);
+ step20_39(34, b, c, d, e, a);
+ step20_39(35, a, b, c, d, e);
+ step20_39(36, e, a, b, c, d);
+ step20_39(37, d, e, a, b, c);
+ step20_39(38, c, d, e, a, b);
+ step20_39(39, b, c, d, e, a);
+
+ step40_59(40, a, b, c, d, e);
+ step40_59(41, e, a, b, c, d);
+ step40_59(42, d, e, a, b, c);
+ step40_59(43, c, d, e, a, b);
+ step40_59(44, b, c, d, e, a);
+ step40_59(45, a, b, c, d, e);
+ step40_59(46, e, a, b, c, d);
+ step40_59(47, d, e, a, b, c);
+ step40_59(48, c, d, e, a, b);
+ step40_59(49, b, c, d, e, a);
+ step40_59(50, a, b, c, d, e);
+ step40_59(51, e, a, b, c, d);
+ step40_59(52, d, e, a, b, c);
+ step40_59(53, c, d, e, a, b);
+ step40_59(54, b, c, d, e, a);
+ step40_59(55, a, b, c, d, e);
+ step40_59(56, e, a, b, c, d);
+ step40_59(57, d, e, a, b, c);
+ step40_59(58, c, d, e, a, b);
+ step40_59(59, b, c, d, e, a);
+
+ step60_79(60, a, b, c, d, e);
+ step60_79(61, e, a, b, c, d);
+ step60_79(62, d, e, a, b, c);
+ step60_79(63, c, d, e, a, b);
+ step60_79(64, b, c, d, e, a);
+ step60_79(65, a, b, c, d, e);
+ step60_79(66, e, a, b, c, d);
+ step60_79(67, d, e, a, b, c);
+ step60_79(68, c, d, e, a, b);
+ step60_79(69, b, c, d, e, a);
+ step60_79(70, a, b, c, d, e);
+ step60_79(71, e, a, b, c, d);
+ step60_79(72, d, e, a, b, c);
+ step60_79(73, c, d, e, a, b);
+ step60_79(74, b, c, d, e, a);
+ step60_79(75, a, b, c, d, e);
+ step60_79(76, e, a, b, c, d);
+ step60_79(77, d, e, a, b, c);
+ step60_79(78, c, d, e, a, b);
+ step60_79(79, b, c, d, e, a);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+}
+
+void sha1_for_mh_sha1_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA1_BLOCK_SIZE];
+ union {
+ uint64_t uint;
+ uint8_t uchar[8];
+ } convert;
+ uint8_t *p;
+
+ digest[0] = MH_SHA1_H0;
+ digest[1] = MH_SHA1_H1;
+ digest[2] = MH_SHA1_H2;
+ digest[3] = MH_SHA1_H3;
+ digest[4] = MH_SHA1_H4;
+
+ i = len;
+ while (i >= SHA1_BLOCK_SIZE) {
+ sha1_single_for_mh_sha1_ref(input_data, digest);
+ input_data += SHA1_BLOCK_SIZE;
+ i -= SHA1_BLOCK_SIZE;
+ }
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - 8); j++)
+ buf[j] = 0;
+
+ if (i > SHA1_BLOCK_SIZE - 8)
+ i = 2 * SHA1_BLOCK_SIZE;
+ else
+ i = SHA1_BLOCK_SIZE;
+
+ convert.uint = 8 * len;
+ p = buf + i - 8;
+ p[0] = convert.uchar[7];
+ p[1] = convert.uchar[6];
+ p[2] = convert.uchar[5];
+ p[3] = convert.uchar[4];
+ p[4] = convert.uchar[3];
+ p[5] = convert.uchar[2];
+ p[6] = convert.uchar[1];
+ p[7] = convert.uchar[0];
+
+ sha1_single_for_mh_sha1_ref(buf, digest);
+ if (i == (2 * SHA1_BLOCK_SIZE))
+ sha1_single_for_mh_sha1_ref(buf + SHA1_BLOCK_SIZE, digest);
+}
+
+/*
+ * Rearrange one segment's data from one block into the new_data buffer.
+ *
+ * Layout of new_data:
+ * segment
+ * -------------------------
+ * w0 | w1 | ... | w15
+ *
+ */
+static inline void transform_input_single(uint32_t * new_data, uint32_t * input,
+ uint32_t segment)
+{
+ new_data[16 * segment + 0] = input[16 * 0 + segment];
+ new_data[16 * segment + 1] = input[16 * 1 + segment];
+ new_data[16 * segment + 2] = input[16 * 2 + segment];
+ new_data[16 * segment + 3] = input[16 * 3 + segment];
+ new_data[16 * segment + 4] = input[16 * 4 + segment];
+ new_data[16 * segment + 5] = input[16 * 5 + segment];
+ new_data[16 * segment + 6] = input[16 * 6 + segment];
+ new_data[16 * segment + 7] = input[16 * 7 + segment];
+ new_data[16 * segment + 8] = input[16 * 8 + segment];
+ new_data[16 * segment + 9] = input[16 * 9 + segment];
+ new_data[16 * segment + 10] = input[16 * 10 + segment];
+ new_data[16 * segment + 11] = input[16 * 11 + segment];
+ new_data[16 * segment + 12] = input[16 * 12 + segment];
+ new_data[16 * segment + 13] = input[16 * 13 + segment];
+ new_data[16 * segment + 14] = input[16 * 14 + segment];
+ new_data[16 * segment + 15] = input[16 * 15 + segment];
+}
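+
+/*
+ * Worked example of the index mapping above: within one MH_SHA1_BLOCK_SIZE
+ * (1024-byte) block viewed as 32-bit words, word w of segment s is
+ * input[16 * w + s]. So segment 3's word 2 comes from input dword
+ * 16 * 2 + 3 = 35 (byte offset 140 of the block) and is copied to
+ * new_data[16 * 3 + 2].
+ */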
+
+// Adapt parameters to sha1_single_for_mh_sha1_ref
+#define sha1_update_one_seg(data, digest) \
+ sha1_single_for_mh_sha1_ref((const uint8_t *)(data), (uint32_t *)(digest))
+
+/*
+ * Rearrange all segments' data from one block into the new_data buffer.
+ *
+ * Layout of new_data:
+ * segment
+ * -------------------------
+ * seg0: | w0 | w1 | ... | w15
+ * seg1: | w0 | w1 | ... | w15
+ * seg2: | w0 | w1 | ... | w15
+ * ....
+ * seg15: | w0 | w1 | ... | w15
+ *
+ */
+static inline void transform_input(uint32_t * new_data, uint32_t * input, uint32_t block)
+{
+ uint32_t *current_input = input + block * MH_SHA1_BLOCK_SIZE / 4;
+
+ transform_input_single(new_data, current_input, 0);
+ transform_input_single(new_data, current_input, 1);
+ transform_input_single(new_data, current_input, 2);
+ transform_input_single(new_data, current_input, 3);
+ transform_input_single(new_data, current_input, 4);
+ transform_input_single(new_data, current_input, 5);
+ transform_input_single(new_data, current_input, 6);
+ transform_input_single(new_data, current_input, 7);
+ transform_input_single(new_data, current_input, 8);
+ transform_input_single(new_data, current_input, 9);
+ transform_input_single(new_data, current_input, 10);
+ transform_input_single(new_data, current_input, 11);
+ transform_input_single(new_data, current_input, 12);
+ transform_input_single(new_data, current_input, 13);
+ transform_input_single(new_data, current_input, 14);
+ transform_input_single(new_data, current_input, 15);
+
+}
+
+/*
+ * Calculate all segments' digests from one block.
+ *
+ * Layout of seg_digest:
+ * segment
+ * -------------------------
+ * seg0: | H0 | H1 | ... | H4
+ * seg1: | H0 | H1 | ... | H4
+ * seg2: | H0 | H1 | ... | H4
+ * ....
+ * seg15: | H0 | H1 | ... | H4
+ *
+ */
+static inline void sha1_update_all_segs(uint32_t * new_data,
+ uint32_t(*mh_sha1_seg_digests)[SHA1_DIGEST_WORDS])
+{
+ sha1_update_one_seg(&(new_data)[16 * 0], mh_sha1_seg_digests[0]);
+ sha1_update_one_seg(&(new_data)[16 * 1], mh_sha1_seg_digests[1]);
+ sha1_update_one_seg(&(new_data)[16 * 2], mh_sha1_seg_digests[2]);
+ sha1_update_one_seg(&(new_data)[16 * 3], mh_sha1_seg_digests[3]);
+ sha1_update_one_seg(&(new_data)[16 * 4], mh_sha1_seg_digests[4]);
+ sha1_update_one_seg(&(new_data)[16 * 5], mh_sha1_seg_digests[5]);
+ sha1_update_one_seg(&(new_data)[16 * 6], mh_sha1_seg_digests[6]);
+ sha1_update_one_seg(&(new_data)[16 * 7], mh_sha1_seg_digests[7]);
+ sha1_update_one_seg(&(new_data)[16 * 8], mh_sha1_seg_digests[8]);
+ sha1_update_one_seg(&(new_data)[16 * 9], mh_sha1_seg_digests[9]);
+ sha1_update_one_seg(&(new_data)[16 * 10], mh_sha1_seg_digests[10]);
+ sha1_update_one_seg(&(new_data)[16 * 11], mh_sha1_seg_digests[11]);
+ sha1_update_one_seg(&(new_data)[16 * 12], mh_sha1_seg_digests[12]);
+ sha1_update_one_seg(&(new_data)[16 * 13], mh_sha1_seg_digests[13]);
+ sha1_update_one_seg(&(new_data)[16 * 14], mh_sha1_seg_digests[14]);
+ sha1_update_one_seg(&(new_data)[16 * 15], mh_sha1_seg_digests[15]);
+}
+
+void mh_sha1_block_ref(const uint8_t * input_data, uint32_t(*digests)[HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE], uint32_t num_blocks)
+{
+ uint32_t i, j;
+ uint32_t *temp_buffer = (uint32_t *) frame_buffer;
+ uint32_t(*trans_digests)[SHA1_DIGEST_WORDS];
+
+ trans_digests = (uint32_t(*)[SHA1_DIGEST_WORDS]) digests;
+
+ // Re-structure seg_digests from 5*16 to 16*5
+ for (j = 0; j < HASH_SEGS; j++) {
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ temp_buffer[j * SHA1_DIGEST_WORDS + i] = digests[i][j];
+ }
+ }
+ memcpy(trans_digests, temp_buffer, 4 * SHA1_DIGEST_WORDS * HASH_SEGS);
+
+ // Calculate digests for all segments, leveraging sha1 API
+ for (i = 0; i < num_blocks; i++) {
+ transform_input(temp_buffer, (uint32_t *) input_data, i);
+ sha1_update_all_segs(temp_buffer, trans_digests);
+ }
+
+ // Re-structure seg_digests from 16*5 to 5*16
+ for (j = 0; j < HASH_SEGS; j++) {
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ temp_buffer[i * HASH_SEGS + j] = trans_digests[j][i];
+ }
+ }
+ memcpy(digests, temp_buffer, 4 * SHA1_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
+
+void mh_sha1_tail_ref(uint8_t * partial_buffer, uint32_t total_len,
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS], uint8_t * frame_buffer,
+ uint32_t digests[SHA1_DIGEST_WORDS])
+{
+ uint64_t partial_buffer_len, len_in_bit;
+
+ partial_buffer_len = total_len % MH_SHA1_BLOCK_SIZE;
+
+ // Padding the first block
+ partial_buffer[partial_buffer_len] = 0x80;
+ partial_buffer_len++;
+ memset(partial_buffer + partial_buffer_len, 0,
+ MH_SHA1_BLOCK_SIZE - partial_buffer_len);
+
+ // Calculate the first block without total_length if the padding needs 2 blocks
+ if (partial_buffer_len > (MH_SHA1_BLOCK_SIZE - 8)) {
+ mh_sha1_block_ref(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1);
+ //Padding the second block
+ memset(partial_buffer, 0, MH_SHA1_BLOCK_SIZE);
+ }
+ //Padding the block
+ len_in_bit = bswap64((uint64_t) total_len * 8);
+ *(uint64_t *) (partial_buffer + MH_SHA1_BLOCK_SIZE - 8) = len_in_bit;
+ mh_sha1_block_ref(partial_buffer, mh_sha1_segs_digests, frame_buffer, 1);
+
+ //Calculate multi-hash SHA1 digests (segment digests as input message)
+ sha1_for_mh_sha1_ref((uint8_t *) mh_sha1_segs_digests, digests,
+ 4 * SHA1_DIGEST_WORDS * HASH_SEGS);
+
+ return;
+}
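+
+/*
+ * Worked example of the padding above (illustrative): for a 1000-byte
+ * message, partial_buffer_len = 1000 % MH_SHA1_BLOCK_SIZE = 1000. The 0x80
+ * byte lands at offset 1000, giving 1001 bytes, which is still at most
+ * MH_SHA1_BLOCK_SIZE - 8 = 1016, so a single padded block is hashed and its
+ * last 8 bytes hold bswap64(1000 * 8). Had the remainder been 1016 bytes or
+ * more, the first padded block would be hashed without the length field and
+ * a second, otherwise all-zero block would carry it.
+ */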
+
+void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest)
+{
+ uint64_t total_len;
+ uint64_t num_blocks;
+ uint32_t mh_sha1_segs_digests[SHA1_DIGEST_WORDS][HASH_SEGS];
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE];
+ uint8_t partial_block_buffer[MH_SHA1_BLOCK_SIZE * 2];
+ uint32_t mh_sha1_hash_dword[SHA1_DIGEST_WORDS];
+ uint32_t i;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+ /* Initialize digests of all segments */
+ for (i = 0; i < HASH_SEGS; i++) {
+ mh_sha1_segs_digests[0][i] = MH_SHA1_H0;
+ mh_sha1_segs_digests[1][i] = MH_SHA1_H1;
+ mh_sha1_segs_digests[2][i] = MH_SHA1_H2;
+ mh_sha1_segs_digests[3][i] = MH_SHA1_H3;
+ mh_sha1_segs_digests[4][i] = MH_SHA1_H4;
+ }
+
+ total_len = len;
+
+ // Calculate blocks
+ num_blocks = len / MH_SHA1_BLOCK_SIZE;
+ if (num_blocks > 0) {
+ //do num_blocks process
+ mh_sha1_block_ref(input_data, mh_sha1_segs_digests, frame_buffer, num_blocks);
+ len -= num_blocks * MH_SHA1_BLOCK_SIZE;
+ input_data += num_blocks * MH_SHA1_BLOCK_SIZE;
+ }
+ // Store the partial block
+ if (len != 0) {
+ memcpy(partial_block_buffer, input_data, len);
+ }
+
+ /* Finalize */
+ mh_sha1_tail_ref(partial_block_buffer, total_len, mh_sha1_segs_digests,
+ frame_buffer, mh_sha1_hash_dword);
+
+ // Output the digests of mh_sha1
+ if (mh_sha1_digest != NULL) {
+ mh_sha1_digest[0] = mh_sha1_hash_dword[0];
+ mh_sha1_digest[1] = mh_sha1_hash_dword[1];
+ mh_sha1_digest[2] = mh_sha1_hash_dword[2];
+ mh_sha1_digest[3] = mh_sha1_hash_dword[3];
+ mh_sha1_digest[4] = mh_sha1_hash_dword[4];
+ }
+
+ return;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c
new file mode 100644
index 000000000..0279cd709
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_test.c
@@ -0,0 +1,217 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_CTX_ERROR_NONE){ \
+ printf("The mh_sha1 function is failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest);
+#define MH_SHA1_REF mh_sha1_ref
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha1_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 20);
+ printf("ref: ");
+ dump((char *)hash_test, 20);
+ }
+
+ return mh_sha1_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_ref[SHA1_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int size, offset;
+ struct mh_sha1_ctx *update_ctx = NULL;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:\n");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ MH_SHA1_REF(buff, TEST_LEN, hash_ref);
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+ // Test various size messages
+ for (size = TEST_LEN; size >= 0; size--) {
+
+ // Fill with rand data
+ rand_buffer(buff, size);
+
+ MH_SHA1_REF(buff, size, hash_ref);
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ if ((size & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various buffer offsets and sizes
+ printf("offset tests");
+ for (size = TEST_LEN - 256; size > 256; size -= 11) {
+ for (offset = 0; offset < 256; offset++) {
+ MH_SHA1_REF(buff + offset, size, hash_ref);
+
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ }
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Run efence tests
+ printf("efence tests");
+ for (size = TEST_SIZE; size > 0; size--) {
+ offset = TEST_LEN - size;
+
+ MH_SHA1_REF(buff + offset, size, hash_ref);
+
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:");
+ printf(" %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c
new file mode 100644
index 000000000..8879b2879
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_base.c
@@ -0,0 +1,110 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+/*
+ * mh_sha1_update_base.c contains the template for mh_sha1_update_XXX.
+ * The default definitions below generate the base type, mh_sha1_update_base.
+ * Other types are generated by mh_sha1.c through different predefined macros.
+ */
+#ifndef MH_SHA1_UPDATE_FUNCTION
+#include "mh_sha1_internal.h"
+#include <string.h>
+
+#define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_base
+#define MH_SHA1_BLOCK_FUNCTION mh_sha1_block_base
+#define MH_SHA1_UPDATE_SLVER
+#endif
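+
+/*
+ * For illustration only -- a sketch of how another variant could be
+ * instantiated from this template via predefined macros (the exact
+ * mechanism used by mh_sha1.c may differ):
+ *
+ *     #define MH_SHA1_UPDATE_FUNCTION mh_sha1_update_sse
+ *     #define MH_SHA1_BLOCK_FUNCTION  mh_sha1_block_sse
+ *     #include "mh_sha1_update_base.c"
+ *     #undef MH_SHA1_UPDATE_FUNCTION
+ *     #undef MH_SHA1_BLOCK_FUNCTION
+ */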
+
+int MH_SHA1_UPDATE_FUNCTION(struct mh_sha1_ctx *ctx, const void *buffer, uint32_t len)
+{
+
+ uint8_t *partial_block_buffer;
+ uint64_t partial_block_len;
+ uint64_t num_blocks;
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+ if (ctx == NULL)
+ return MH_SHA1_CTX_ERROR_NULL;
+
+ if (len == 0)
+ return MH_SHA1_CTX_ERROR_NONE;
+
+ partial_block_len = ctx->total_length % MH_SHA1_BLOCK_SIZE;
+ partial_block_buffer = ctx->partial_block_buffer;
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+
+ ctx->total_length += len;
+ // Not enough input data for mh_sha1 calculation
+ if (len + partial_block_len < MH_SHA1_BLOCK_SIZE) {
+ memcpy(partial_block_buffer + partial_block_len, input_data, len);
+ return MH_SHA1_CTX_ERROR_NONE;
+ }
+ // mh_sha1 calculation for the previous partial block
+ if (partial_block_len != 0) {
+ memcpy(partial_block_buffer + partial_block_len, input_data,
+ MH_SHA1_BLOCK_SIZE - partial_block_len);
+ //do one_block process
+ MH_SHA1_BLOCK_FUNCTION(partial_block_buffer, mh_sha1_segs_digests,
+ aligned_frame_buffer, 1);
+ input_data += MH_SHA1_BLOCK_SIZE - partial_block_len;
+ len -= MH_SHA1_BLOCK_SIZE - partial_block_len;
+ memset(partial_block_buffer, 0, MH_SHA1_BLOCK_SIZE);
+ }
+ // Calculate mh_sha1 for the current blocks
+ num_blocks = len / MH_SHA1_BLOCK_SIZE;
+ if (num_blocks > 0) {
+ //do num_blocks process
+ MH_SHA1_BLOCK_FUNCTION(input_data, mh_sha1_segs_digests, aligned_frame_buffer,
+ num_blocks);
+ len -= num_blocks * MH_SHA1_BLOCK_SIZE;
+ input_data += num_blocks * MH_SHA1_BLOCK_SIZE;
+ }
+ // Store the partial block
+ if (len != 0) {
+ memcpy(partial_block_buffer, input_data, len);
+ }
+
+ return MH_SHA1_CTX_ERROR_NONE;
+
+}
+
+#ifdef MH_SHA1_UPDATE_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+ // Version info
+struct slver mh_sha1_update_base_slver_0000027a;
+struct slver mh_sha1_update_base_slver = { 0x027a, 0x00, 0x00 };
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c
new file mode 100644
index 000000000..2b1b79179
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/mh_sha1_update_test.c
@@ -0,0 +1,240 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_CTX_ERROR_NONE){ \
+ printf("The mh_sha1 function is failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest);
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha1_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 20);
+ printf("ref: ");
+ dump((char *)hash_test, 20);
+ }
+
+ return mh_sha1_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0, i;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_ref[SHA1_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int update_count;
+ int size1, size2, offset, addr_offset;
+ struct mh_sha1_ctx *update_ctx = NULL;
+ uint8_t *mem_addr = NULL;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_test:");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+ printf("malloc failed test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_ref(buff, TEST_LEN, hash_ref);
+
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+ // Test various size messages using two update calls.
+ printf("\n various size messages, two-update tests");
+ for (size1 = TEST_LEN; size1 >= 0; size1--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_ref(buff, TEST_LEN, hash_ref);
+
+ // subsequent update
+ size2 = TEST_LEN - size1; // size2 differs from the former
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size1));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + size1, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various update count
+ printf("\n various update count tests");
+ for (update_count = 1; update_count <= TEST_LEN; update_count++) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_ref(buff, TEST_LEN, hash_ref);
+
+ // subsequent update
+ size1 = TEST_LEN / update_count;
+ size2 = TEST_LEN - size1 * (update_count - 1); // size2 differs from the former
+
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ for (i = 1, offset = 0; i < update_count; i++) {
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size1));
+ offset += size1;
+ }
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // test various start address of ctx.
+ printf("\n various start address of ctx test");
+ free(update_ctx);
+ mem_addr = (uint8_t *) malloc(sizeof(*update_ctx) + AVX512_ALIGNED * 10);
+ for (addr_offset = AVX512_ALIGNED * 10; addr_offset >= 0; addr_offset--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_ref(buff, TEST_LEN, hash_ref);
+
+ // an unaligned offset
+ update_ctx = (struct mh_sha1_ctx *)(mem_addr + addr_offset);
+ CHECK_RETURN(mh_sha1_init(update_ctx));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test));
+
+ fail = compare_digests(hash_ref, hash_test);
+
+ if (fail) {
+ printf("Fail addr_offset=%d\n", addr_offset);
+ return -1;
+ }
+
+ if ((addr_offset & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c
new file mode 100644
index 000000000..697155b50
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1/sha1_for_mh_sha1.c
@@ -0,0 +1,212 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha1_internal.h"
+#include <string.h>
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA1 Functions for mh_sha1
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#define W(x) w[(x) & 15]
+
+#define step00_19(i,a,b,c,d,e) \
+ if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ else W(i) = bswap(ww[i]); \
+ e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \
+ b = rol32(b,30)
+
+#define step20_39(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \
+ b = rol32(b,30)
+
+#define step40_59(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \
+ b = rol32(b,30)
+
+#define step60_79(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \
+ b = rol32(b,30)
+
+void sha1_single_for_mh_sha1(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e;
+ uint32_t w[16] = { 0 };
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+
+ step00_19(0, a, b, c, d, e);
+ step00_19(1, e, a, b, c, d);
+ step00_19(2, d, e, a, b, c);
+ step00_19(3, c, d, e, a, b);
+ step00_19(4, b, c, d, e, a);
+ step00_19(5, a, b, c, d, e);
+ step00_19(6, e, a, b, c, d);
+ step00_19(7, d, e, a, b, c);
+ step00_19(8, c, d, e, a, b);
+ step00_19(9, b, c, d, e, a);
+ step00_19(10, a, b, c, d, e);
+ step00_19(11, e, a, b, c, d);
+ step00_19(12, d, e, a, b, c);
+ step00_19(13, c, d, e, a, b);
+ step00_19(14, b, c, d, e, a);
+ step00_19(15, a, b, c, d, e);
+ step00_19(16, e, a, b, c, d);
+ step00_19(17, d, e, a, b, c);
+ step00_19(18, c, d, e, a, b);
+ step00_19(19, b, c, d, e, a);
+
+ step20_39(20, a, b, c, d, e);
+ step20_39(21, e, a, b, c, d);
+ step20_39(22, d, e, a, b, c);
+ step20_39(23, c, d, e, a, b);
+ step20_39(24, b, c, d, e, a);
+ step20_39(25, a, b, c, d, e);
+ step20_39(26, e, a, b, c, d);
+ step20_39(27, d, e, a, b, c);
+ step20_39(28, c, d, e, a, b);
+ step20_39(29, b, c, d, e, a);
+ step20_39(30, a, b, c, d, e);
+ step20_39(31, e, a, b, c, d);
+ step20_39(32, d, e, a, b, c);
+ step20_39(33, c, d, e, a, b);
+ step20_39(34, b, c, d, e, a);
+ step20_39(35, a, b, c, d, e);
+ step20_39(36, e, a, b, c, d);
+ step20_39(37, d, e, a, b, c);
+ step20_39(38, c, d, e, a, b);
+ step20_39(39, b, c, d, e, a);
+
+ step40_59(40, a, b, c, d, e);
+ step40_59(41, e, a, b, c, d);
+ step40_59(42, d, e, a, b, c);
+ step40_59(43, c, d, e, a, b);
+ step40_59(44, b, c, d, e, a);
+ step40_59(45, a, b, c, d, e);
+ step40_59(46, e, a, b, c, d);
+ step40_59(47, d, e, a, b, c);
+ step40_59(48, c, d, e, a, b);
+ step40_59(49, b, c, d, e, a);
+ step40_59(50, a, b, c, d, e);
+ step40_59(51, e, a, b, c, d);
+ step40_59(52, d, e, a, b, c);
+ step40_59(53, c, d, e, a, b);
+ step40_59(54, b, c, d, e, a);
+ step40_59(55, a, b, c, d, e);
+ step40_59(56, e, a, b, c, d);
+ step40_59(57, d, e, a, b, c);
+ step40_59(58, c, d, e, a, b);
+ step40_59(59, b, c, d, e, a);
+
+ step60_79(60, a, b, c, d, e);
+ step60_79(61, e, a, b, c, d);
+ step60_79(62, d, e, a, b, c);
+ step60_79(63, c, d, e, a, b);
+ step60_79(64, b, c, d, e, a);
+ step60_79(65, a, b, c, d, e);
+ step60_79(66, e, a, b, c, d);
+ step60_79(67, d, e, a, b, c);
+ step60_79(68, c, d, e, a, b);
+ step60_79(69, b, c, d, e, a);
+ step60_79(70, a, b, c, d, e);
+ step60_79(71, e, a, b, c, d);
+ step60_79(72, d, e, a, b, c);
+ step60_79(73, c, d, e, a, b);
+ step60_79(74, b, c, d, e, a);
+ step60_79(75, a, b, c, d, e);
+ step60_79(76, e, a, b, c, d);
+ step60_79(77, d, e, a, b, c);
+ step60_79(78, c, d, e, a, b);
+ step60_79(79, b, c, d, e, a);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+}
+
+void sha1_for_mh_sha1(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA1_BLOCK_SIZE];
+ union {
+ uint64_t uint;
+ uint8_t uchar[8];
+ } convert;
+ uint8_t *p;
+
+ digest[0] = MH_SHA1_H0;
+ digest[1] = MH_SHA1_H1;
+ digest[2] = MH_SHA1_H2;
+ digest[3] = MH_SHA1_H3;
+ digest[4] = MH_SHA1_H4;
+
+ i = len;
+ while (i >= SHA1_BLOCK_SIZE) {
+ sha1_single_for_mh_sha1(input_data, digest);
+ input_data += SHA1_BLOCK_SIZE;
+ i -= SHA1_BLOCK_SIZE;
+ }
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - 8); j++)
+ buf[j] = 0;
+
+ if (i > SHA1_BLOCK_SIZE - 8)
+ i = 2 * SHA1_BLOCK_SIZE;
+ else
+ i = SHA1_BLOCK_SIZE;
+
+ convert.uint = 8 * len;
+ p = buf + i - 8;
+ p[0] = convert.uchar[7];
+ p[1] = convert.uchar[6];
+ p[2] = convert.uchar[5];
+ p[3] = convert.uchar[4];
+ p[4] = convert.uchar[3];
+ p[5] = convert.uchar[2];
+ p[6] = convert.uchar[1];
+ p[7] = convert.uchar[0];
+
+ sha1_single_for_mh_sha1(buf, digest);
+ if (i == (2 * SHA1_BLOCK_SIZE))
+ sha1_single_for_mh_sha1(buf + SHA1_BLOCK_SIZE, digest);
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am
new file mode 100644
index 000000000..98cd59efc
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/Makefile.am
@@ -0,0 +1,71 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+#Requires unit mh_sha1
+AM_CFLAGS += -I mh_sha1
+
+lsrc_murmur = mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c
+
+lsrc_stitch = mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm
+
+lsrc_stitch += mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm
+
+lsrc += $(lsrc_murmur) \
+ $(lsrc_stitch)
+
+other_src += include/reg_sizes.asm \
+ include/multibinary.asm \
+ include/test.h \
+ mh_sha1/mh_sha1_internal.h \
+ mh_sha1_murmur3_x64_128/murmur3_x64_128.c \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h
+
+extern_hdrs += include/mh_sha1_murmur3_x64_128.h
+
+unit_tests += mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test \
+ mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test
+
+perf_tests += mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf
+
+
+mh_sha1_murmur3_x64_128_test: mh_sha1_ref.o murmur3_x64_128.o
+mh_sha1_murmur3_x64_128_mh_sha1_murmur3_x64_128_test_LDADD = mh_sha1/mh_sha1_ref.lo mh_sha1_murmur3_x64_128/murmur3_x64_128.lo libisal_crypto.la
+
+mh_sha1_murmur3_x64_128_update_test: mh_sha1_ref.o murmur3_x64_128.o
+mh_sha1_murmur3_x64_128_mh_sha1_murmur3_x64_128_update_test_LDADD = mh_sha1/mh_sha1_ref.lo mh_sha1_murmur3_x64_128/murmur3_x64_128.lo libisal_crypto.la
+
+mh_sha1_murmur3_x64_128_perf: mh_sha1_ref.o murmur3_x64_128.o
+mh_sha1_murmur3_x64_128_mh_sha1_murmur3_x64_128_perf_LDADD = mh_sha1/mh_sha1_ref.lo mh_sha1_murmur3_x64_128/murmur3_x64_128.lo libisal_crypto.la
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c
new file mode 100644
index 000000000..12cb3644d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128.c
@@ -0,0 +1,151 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_murmur3_x64_128_internal.h"
+
+int mh_sha1_murmur3_x64_128_init(struct mh_sha1_murmur3_x64_128_ctx *ctx, uint64_t murmur_seed)
+{
+ uint64_t *murmur3_x64_128_hash;
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint32_t i;
+
+ if (ctx == NULL)
+ return MH_SHA1_MURMUR3_CTX_ERROR_NULL;
+
+ memset(ctx, 0, sizeof(*ctx));
+
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+ for (i = 0; i < HASH_SEGS; i++) {
+ mh_sha1_segs_digests[0][i] = MH_SHA1_H0;
+ mh_sha1_segs_digests[1][i] = MH_SHA1_H1;
+ mh_sha1_segs_digests[2][i] = MH_SHA1_H2;
+ mh_sha1_segs_digests[3][i] = MH_SHA1_H3;
+ mh_sha1_segs_digests[4][i] = MH_SHA1_H4;
+ }
+
+ murmur3_x64_128_hash = (uint64_t *) ctx->murmur3_x64_128_digest;
+ murmur3_x64_128_hash[0] = murmur_seed;
+ murmur3_x64_128_hash[1] = murmur_seed;
+
+ return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+}
+
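+/* Example (a sketch, not part of the original file): typical caller flow,
+ * assuming the public entry points declared in include/mh_sha1_murmur3_x64_128.h;
+ * seed, buffer and len are placeholder names:
+ *
+ *	struct mh_sha1_murmur3_x64_128_ctx ctx;
+ *	uint32_t sha1_digest[SHA1_DIGEST_WORDS];
+ *	uint32_t murmur_digest[MURMUR3_x64_128_DIGEST_WORDS];
+ *
+ *	mh_sha1_murmur3_x64_128_init(&ctx, seed);
+ *	mh_sha1_murmur3_x64_128_update(&ctx, buffer, len);    (may be called repeatedly)
+ *	mh_sha1_murmur3_x64_128_finalize(&ctx, sha1_digest, murmur_digest);
+ */
+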
+void mh_sha1_murmur3_x64_128_block_base(const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t
+ murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks)
+{
+
+ mh_sha1_block_base(input_data, mh_sha1_digests, frame_buffer, num_blocks);
+
+ murmur3_x64_128_block(input_data,
+ num_blocks * MH_SHA1_BLOCK_SIZE / MUR_BLOCK_SIZE,
+ murmur3_x64_128_digests);
+
+ return;
+}
+
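+/* Note (assumes the values in the internal headers): MH_SHA1_BLOCK_SIZE is the
+ * 1 KB mh_sha1 big block and MUR_BLOCK_SIZE the 16-byte murmur3 chunk, so the
+ * murmur pass above consumes exactly 64 chunks per mh_sha1 block of the same
+ * input. */
+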
+/***************mh_sha1_murmur3_x64_128_update***********/
+// mh_sha1_murmur3_x64_128_update_sse.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_sse
+#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_sse
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+// mh_sha1_murmur3_x64_128_update_avx.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_avx
+#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_avx
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+// mh_sha1_murmur3_x64_128_update_avx2.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_avx2
+#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_avx2
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+/***************mh_sha1_murmur3_x64_128_finalize***********/
+// mh_sha1_murmur3_x64_128_finalize_sse.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_sse
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_sse
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+
+// mh_sha1_murmur3_x64_128_finalize_avx.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_avx
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+
+// mh_sha1_murmur3_x64_128_finalize_avx2.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_avx2
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx2
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+
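+/* The #define/#include blocks above instantiate one update and one finalize
+ * routine per arch from the shared *_base.c templates. As a sketch (assuming the
+ * signature used by the update template), the first block expands to roughly:
+ *
+ *	int mh_sha1_murmur3_x64_128_update_sse(struct mh_sha1_murmur3_x64_128_ctx *ctx,
+ *	                                       const void *buffer, uint32_t len);
+ *
+ * with its inner block routine bound to mh_sha1_murmur3_x64_128_block_sse.
+ */
+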
+/***************version info***********/
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// Version info
+struct slver mh_sha1_murmur3_x64_128_init_slver_00000251;
+struct slver mh_sha1_murmur3_x64_128_init_slver = { 0x0251, 0x00, 0x00 };
+
+// mh_sha1_murmur3_x64_128_update version info
+struct slver mh_sha1_murmur3_x64_128_update_sse_slver_00000254;
+struct slver mh_sha1_murmur3_x64_128_update_sse_slver = { 0x0254, 0x00, 0x00 };
+
+struct slver mh_sha1_murmur3_x64_128_update_avx_slver_02000256;
+struct slver mh_sha1_murmur3_x64_128_update_avx_slver = { 0x0256, 0x00, 0x02 };
+
+struct slver mh_sha1_murmur3_x64_128_update_avx2_slver_04000258;
+struct slver mh_sha1_murmur3_x64_128_update_avx2_slver = { 0x0258, 0x00, 0x04 };
+
+// mh_sha1_murmur3_x64_128_finalize version info
+struct slver mh_sha1_murmur3_x64_128_finalize_sse_slver_00000255;
+struct slver mh_sha1_murmur3_x64_128_finalize_sse_slver = { 0x0255, 0x00, 0x00 };
+
+struct slver mh_sha1_murmur3_x64_128_finalize_avx_slver_02000257;
+struct slver mh_sha1_murmur3_x64_128_finalize_avx_slver = { 0x0257, 0x00, 0x02 };
+
+struct slver mh_sha1_murmur3_x64_128_finalize_avx2_slver_04000259;
+struct slver mh_sha1_murmur3_x64_128_finalize_avx2_slver = { 0x0259, 0x00, 0x04 };
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c
new file mode 100644
index 000000000..e380a8795
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_avx512.c
@@ -0,0 +1,67 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "mh_sha1_murmur3_x64_128_internal.h"
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+/***************mh_sha1_murmur3_x64_128_update***********/
+// mh_sha1_murmur3_x64_128_update_avx512.c
+#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_avx512
+#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_avx512
+#include "mh_sha1_murmur3_x64_128_update_base.c"
+#undef UPDATE_FUNCTION
+#undef BLOCK_FUNCTION
+
+/***************mh_sha1_murmur3_x64_128_finalize***********/
+// mh_sha1_murmur3_x64_128_finalize_avx512.c
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_avx512
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_avx512
+#include "mh_sha1_murmur3_x64_128_finalize_base.c"
+#undef FINALIZE_FUNCTION
+#undef MH_SHA1_TAIL_FUNCTION
+
+/***************version info***********/
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+// mh_sha1_murmur3_x64_128_update version info
+struct slver mh_sha1_murmur3_x64_128_update_avx512_slver_0600025c;
+struct slver mh_sha1_murmur3_x64_128_update_avx512_slver = { 0x025c, 0x00, 0x06 };
+
+// mh_sha1_murmur3_x64_128_finalize version info
+struct slver mh_sha1_murmur3_x64_128_finalize_avx512_slver_0600025d;
+struct slver mh_sha1_murmur3_x64_128_finalize_avx512_slver = { 0x025d, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm
new file mode 100644
index 000000000..ca35e6961
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx.asm
@@ -0,0 +1,702 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA1 segments stitched with murmur3_x64_128 using AVX
+;;
+
+%include "reg_sizes.asm"
+default rel
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF, %%regC,%%regD
+ vpand %%regF, %%regF,%%regB
+ vpxor %%regF, %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-(%%imm))
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-(%%imm))
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[%%data + (%%memW * 16)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;; Insert murmur's instructions into this macro.
+;; Each segs_loop pass of mh_sha1 calls SHA1_STEP_16_79 64 times and covers 256 bytes
+;; of input, so one 16-byte murmur block is interleaved into every 4 SHA1_STEP_16_79 calls.
+%define SHA1_STEP_16_79(J) SHA1_STEP_16_79_ %+ J
+
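+;; For reference, the scalar MurmurHash3 x64_128 body round that the four stitched
+;; step variants below spread across their scalar slots (a sketch in C; k1/k2 are the
+;; two 64-bit words loaded from mur_in_p, h1/h2 the running hash state, and rotl64()
+;; is a 64-bit rotate-left helper, not defined in this file):
+;;
+;;	k1 *= C1;  k1 = rotl64(k1, R1);  k1 *= C2;	/* spread over steps _0 and _1 */
+;;	k2 *= C2;  k2 = rotl64(k2, R2);  k2 *= C1;	/* spread over steps _0 and _1 */
+;;	h1 ^= k1;  h1 = rotl64(h1, R3);  h1 += h2;  h1 = h1 * 5 + N1;	/* step _2 */
+;;	h2 ^= k2;  h2 = rotl64(h2, R4);  h2 += h1;  h2 = h2 * 5 + N2;	/* step _3 */
+;;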
+%macro SHA1_STEP_16_79_0 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ mov mur_data1, [mur_in_p]
+ mov mur_data2, [mur_in_p + 8]
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ imul mur_data1, mur_c1_r
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ imul mur_data2, mur_c2_r
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_1 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ rol mur_data1, R1
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ rol mur_data2, R2
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ imul mur_data1, mur_c2_r
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ imul mur_data2, mur_c1_r
+ PROLD %%regB,30, %%regT
+ add mur_in_p, 16
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_2 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ xor mur_hash1, mur_data1
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+ rol mur_hash1, R3
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ add mur_hash1, mur_hash2
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1]
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_3 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ xor mur_hash2, mur_data2
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 16]
+ rol mur_hash2, R4
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ add mur_hash2, mur_hash1
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2]
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8d
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbx
+ push rbp
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10d
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8
+ %define PS 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ save_reg rbx, 10*16 + 6*8
+ save_reg rbp, 10*16 + 7*8
+ end_prolog
+ mov arg4, arg(4)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ mov rbx, [rsp + 10*16 + 6*8]
+ mov rbp, [rsp + 10*16 + 7*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg4
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables of murmur3
+%define mur_in_p tmp2
+%define mur_digest_p arg3
+%define mur_hash1 tmp3
+%define mur_hash2 tmp4
+%define mur_data1 tmp5
+%define mur_data2 return
+%define mur_c1_r tmp6
+%define mur_c2_r arg5
+; constants of murmur3_x64_128
+%define R1 31
+%define R2 33
+%define R3 27
+%define R4 31
+%define M 5
+%define N1 0x52dce729;DWORD
+%define N2 0x38495ab5;DWORD
+%define C1 QWORD(0x87c37b91114253d5)
+%define C2 QWORD(0x4cf5ad432745937f)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp7
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 |    | Ea| Eb | Ec |...| Ep |
+
+align 32
+;void mh_sha1_murmur3_x64_128_block_avx (const uint8_t * input_data,
+; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+; uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
+; arg 3 pointer to murmur3 digest
+; arg 4 number of 1KB blocks
+;
+global mh_sha1_murmur3_x64_128_block_avx:function internal
+func(mh_sha1_murmur3_x64_128_block_avx)
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 16 Bytes needed by avx
+ and rsp, ~0x0F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 5
+ VMOVPS A, [mh_digests_p + I*64 + 16*0]
+ VMOVPS B, [mh_digests_p + I*64 + 16*1]
+ VMOVPS C, [mh_digests_p + I*64 + 16*2]
+ VMOVPS D, [mh_digests_p + I*64 + 16*3]
+
+ vmovdqa [rsp + I*64 + 16*0], A
+ vmovdqa [rsp + I*64 + 16*1], B
+ vmovdqa [rsp + I*64 + 16*2], C
+ vmovdqa [rsp + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+
+ ;init murmur variables
+	mov	mur_in_p, mh_in_p	;murmur walks the input at its own stride, so keep a separate pointer
+ ;load murmur hash digests and multiplier
+ mov mur_hash1, [mur_digest_p]
+ mov mur_hash2, [mur_digest_p + 8]
+ mov mur_c1_r, C1
+ mov mur_c2_r, C2
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4
+ %assign I 0
+ %rep 16
+ VMOVPS T0,[mh_in_p + I*64+0*16]
+ VMOVPS T1,[mh_in_p + I*64+1*16]
+ VMOVPS T2,[mh_in_p + I*64+2*16]
+ VMOVPS T3,[mh_in_p + I*64+3*16]
+
+ vpshufb T0, F
+ vmovdqa [mh_data_p +(I)*16 +0*256],T0
+ vpshufb T1, F
+ vmovdqa [mh_data_p +(I)*16 +1*256],T1
+ vpshufb T2, F
+ vmovdqa [mh_data_p +(I)*16 +2*256],T2
+ vpshufb T3, F
+ vmovdqa [mh_data_p +(I)*16 +3*256],T3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+ .segs_loop:
+ ;; Initialize digests
+ vmovdqa A, [rsp + 0*64 + mh_segs]
+ vmovdqa B, [rsp + 1*64 + mh_segs]
+ vmovdqa C, [rsp + 2*64 + mh_segs]
+ vmovdqa D, [rsp + 3*64 + mh_segs]
+ vmovdqa E, [rsp + 4*64 + mh_segs]
+
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 16]
+ vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 16]
+ %rep 4
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 20...39
+ vmovdqa K, [K20_39]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 40...59
+ vmovdqa K, [K40_59]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 60...79
+ vmovdqa K, [K60_79]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ vpaddd A, AA
+ vpaddd B, BB
+ vpaddd C, CC
+ vpaddd D, DD
+ vpaddd E, EE
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], A
+ vmovdqa [rsp + 1*64 + mh_segs], B
+ vmovdqa [rsp + 2*64 + mh_segs], C
+ vmovdqa [rsp + 3*64 + mh_segs], D
+ vmovdqa [rsp + 4*64 + mh_segs], E
+
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ ;store murmur-hash digest
+ mov [mur_digest_p], mur_hash1
+ mov [mur_digest_p + 8], mur_hash2
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 5
+ vmovdqa A, [rsp + I*64 + 16*0]
+ vmovdqa B, [rsp + I*64 + 16*1]
+ vmovdqa C, [rsp + I*64 + 16*2]
+ vmovdqa D, [rsp + I*64 + 16*3]
+
+ VMOVPS [mh_digests_p + I*64 + 16*0], A
+ VMOVPS [mh_digests_p + I*64 + 16*1], B
+ VMOVPS [mh_digests_p + I*64 + 16*2], C
+ VMOVPS [mh_digests_p + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm
new file mode 100644
index 000000000..c7e9a889b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx2.asm
@@ -0,0 +1,649 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA1 segments stitched with murmur3_x64_128 using AVX2
+;;
+
+%include "reg_sizes.asm"
+default rel
+
+;; Magic functions defined in FIPS 180-1
+;;
+;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ	;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF, %%regC,%%regD
+ vpand %%regF, %%regF,%%regB
+ vpxor %%regF, %%regF,%%regD
+%endmacro
+
+;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+
+
+;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-%%imm)
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[%%data + (%%memW * 32)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;; Insert murmur's instructions into this macro.
+;; Each segs_loop pass of mh_sha1 calls SHA1_STEP_16_79 64 times and covers 512 bytes
+;; of input, so one 16-byte murmur block is interleaved into every 2 SHA1_STEP_16_79 calls.
+%define SHA1_STEP_16_79(J) SHA1_STEP_16_79_ %+ J
+
+%macro SHA1_STEP_16_79_0 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32]
+ mov mur_data1, [mur_in_p]
+ mov mur_data2, [mur_in_p + 8]
+
+ vpsrld %%regF, W16, (32-1)
+ imul mur_data1, mur_c1_r
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ imul mur_data2, mur_c2_r
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF
+ rol mur_data1, R1
+ vpaddd %%regE, %%regE,%%regF
+ rol mur_data2, R2
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ imul mur_data1, mur_c2_r
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ imul mur_data2, mur_c1_r
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+
+%macro SHA1_STEP_16_79_1 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ vpaddd %%regE, %%regE,%%immCNT
+ xor mur_hash1, mur_data1
+ vmovdqa W14, [%%data + ((%%memW - 14) & 15) * 32]
+ rol mur_hash1, R3
+ vpxor W16, W16, W14
+ add mur_hash1, mur_hash2
+ vpxor W16, W16, [%%data + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [%%data + ((%%memW - 3) & 15) * 32]
+ lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1]
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ xor mur_hash2, mur_data2
+ vpor %%regF, %%regF, W16
+ rol mur_hash2, R4
+ ROTATE_W
+
+ vmovdqa [%%data + ((%%memW - 0) & 15) * 32],%%regF
+ vpaddd %%regE, %%regE,%%regF
+ add mur_hash2, mur_hash1
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2]
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ add mur_in_p, 16
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8d
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbx
+ push rbp
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10d
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8
+ %define PS 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ save_reg rbx, 10*16 + 6*8
+ save_reg rbp, 10*16 + 7*8
+ end_prolog
+ mov arg4, arg(4)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ mov rbx, [rsp + 10*16 + 6*8]
+ mov rbp, [rsp + 10*16 + 7*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg4
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables of murmur3
+%define mur_in_p tmp2
+%define mur_digest_p arg3
+%define mur_hash1 tmp3
+%define mur_hash2 tmp4
+%define mur_data1 tmp5
+%define mur_data2 return
+%define mur_c1_r tmp6
+%define mur_c2_r arg5
+; constants of murmur3_x64_128
+%define R1 31
+%define R2 33
+%define R3 27
+%define R4 31
+%define M 5
+%define N1 0x52dce729;DWORD
+%define N2 0x38495ab5;DWORD
+%define C1 QWORD(0x87c37b91114253d5)
+%define C2 QWORD(0x4cf5ad432745937f)
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp7
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+
+%define pref tmp8
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovups
+
+%define A ymm0
+%define B ymm1
+%define C ymm2
+%define D ymm3
+%define E ymm4
+
+%define F ymm5
+%define T0 ymm6
+%define T1 ymm7
+%define T2 ymm8
+%define T3 ymm9
+%define T4 ymm10
+%define T5 ymm11
+%define T6 ymm12
+%define T7 ymm13
+%define T8 ymm14
+%define T9 ymm15
+
+%define AA ymm5
+%define BB ymm6
+%define CC ymm7
+%define DD ymm8
+%define EE ymm9
+%define TMP ymm10
+%define FUN ymm11
+%define K ymm12
+%define W14 ymm13
+%define W15 ymm14
+%define W16 ymm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 |    | Ea| Eb | Ec |...| Ep |
+
+align 32
+;void mh_sha1_murmur3_x64_128_block_avx2 (const uint8_t * input_data,
+; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+; uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
+; arg 3 pointer to murmur3 digest
+; arg 4 number of 1KB blocks
+;
+global mh_sha1_murmur3_x64_128_block_avx2:function internal
+func(mh_sha1_murmur3_x64_128_block_avx2)
+ FUNC_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 32 Bytes needed by avx2
+ and rsp, ~0x1F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 2
+ VMOVPS A, [mh_digests_p + I*32*5 + 32*0]
+ VMOVPS B, [mh_digests_p + I*32*5 + 32*1]
+ VMOVPS C, [mh_digests_p + I*32*5 + 32*2]
+ VMOVPS D, [mh_digests_p + I*32*5 + 32*3]
+ VMOVPS E, [mh_digests_p + I*32*5 + 32*4]
+
+ vmovdqa [rsp + I*32*5 + 32*0], A
+ vmovdqa [rsp + I*32*5 + 32*1], B
+ vmovdqa [rsp + I*32*5 + 32*2], C
+ vmovdqa [rsp + I*32*5 + 32*3], D
+ vmovdqa [rsp + I*32*5 + 32*4], E
+ %assign I (I+1)
+ %endrep
+
+ ;init murmur variables
+	mov	mur_in_p, mh_in_p	;murmur walks the input at its own stride, so keep a separate pointer
+ ;load murmur hash digests and multiplier
+ mov mur_hash1, [mur_digest_p]
+ mov mur_hash2, [mur_digest_p + 8]
+ mov mur_c1_r, C1
+ mov mur_c2_r, C2
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*8_SEGS*5*2
+%assign I 0
+%rep 16
+ VMOVPS T0,[mh_in_p + I*64+0*32]
+ VMOVPS T1,[mh_in_p + I*64+1*32]
+
+ vpshufb T0, T0, F
+ vmovdqa [mh_data_p +I*32+0*512],T0
+ vpshufb T1, T1, F
+ vmovdqa [mh_data_p +I*32+1*512],T1
+%assign I (I+1)
+%endrep
+
+ mov mh_segs, 0 ;start from the first 8 segments
+	mov	pref, 1024		;start at the next 1KB block to avoid prefetching the same lines repeatedly
+ .segs_loop:
+ ;; Initialize digests
+ vmovdqa A, [rsp + 0*64 + mh_segs]
+ vmovdqa B, [rsp + 1*64 + mh_segs]
+ vmovdqa C, [rsp + 2*64 + mh_segs]
+ vmovdqa D, [rsp + 3*64 + mh_segs]
+ vmovdqa E, [rsp + 4*64 + mh_segs]
+
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [mh_data_p + ((16 - 16) & 15) * 32]
+ vmovdqa W15, [mh_data_p + ((16 - 15) & 15) * 32]
+ %rep 4
+ %assign J (I % 2)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*0]
+ PREFETCH_X [mh_in_p + pref+128*1]
+;; do rounds 20...39
+ vmovdqa K, [K20_39]
+ %rep 20
+ %assign J (I % 2)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+;; do rounds 40...59
+ vmovdqa K, [K40_59]
+ %rep 20
+ %assign J (I % 2)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+ PREFETCH_X [mh_in_p + pref+128*2]
+ PREFETCH_X [mh_in_p + pref+128*3]
+;; do rounds 60...79
+ vmovdqa K, [K60_79]
+ %rep 20
+ %assign J (I % 2)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ vpaddd A,A, AA
+ vpaddd B,B, BB
+ vpaddd C,C, CC
+ vpaddd D,D, DD
+ vpaddd E,E, EE
+
+ ; write out digests
+ vmovdqa [rsp + 0*64 + mh_segs], A
+ vmovdqa [rsp + 1*64 + mh_segs], B
+ vmovdqa [rsp + 2*64 + mh_segs], C
+ vmovdqa [rsp + 3*64 + mh_segs], D
+ vmovdqa [rsp + 4*64 + mh_segs], E
+
+ add pref, 512
+
+ add mh_data_p, 512
+ add mh_segs, 32
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ ;store murmur-hash digest
+ mov [mur_digest_p], mur_hash1
+ mov [mur_digest_p + 8], mur_hash2
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 2
+ vmovdqa A, [rsp + I*32*5 + 32*0]
+ vmovdqa B, [rsp + I*32*5 + 32*1]
+ vmovdqa C, [rsp + I*32*5 + 32*2]
+ vmovdqa D, [rsp + I*32*5 + 32*3]
+ vmovdqa E, [rsp + I*32*5 + 32*4]
+
+ VMOVPS [mh_digests_p + I*32*5 + 32*0], A
+ VMOVPS [mh_digests_p + I*32*5 + 32*1], B
+ VMOVPS [mh_digests_p + I*32*5 + 32*2], C
+ VMOVPS [mh_digests_p + I*32*5 + 32*3], D
+ VMOVPS [mh_digests_p + I*32*5 + 32*4], E
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=32
+
+align 32
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm
new file mode 100644
index 000000000..8cc84959e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_avx512.asm
@@ -0,0 +1,500 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute 16 SHA1 segments stitched with murmur3_x64_128 using AVX-512
+;;
+
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+default rel
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define VMOVPS vmovdqu64
+;SIMD variables definition
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define HH0 zmm5
+%define HH1 zmm6
+%define HH2 zmm7
+%define HH3 zmm8
+%define HH4 zmm9
+%define KT zmm10
+%define XTMP0 zmm11
+%define XTMP1 zmm12
+%define SHUF_MASK zmm13
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;use 16 extra ZMM registers to hold the byte-swapped (big-endian) input data
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;macros definition
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%F_IMMED %2
+
+ ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
+ ; E=D, D=C, C=ROTL_30(B), B=A, A=T
+
+ ; Ft
+ ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
+ ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
+ ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
+
+ vmovdqa32 XTMP1, B ; Copy B
+ vpaddd E, E, %%WT ; E = E + Wt
+ vpternlogd XTMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D)
+ vpaddd E, E, KT ; E = E + Wt + Kt
+ vprold XTMP0, A, 5 ; TMP0 = ROTL_5(A)
+ vpaddd E, E, XTMP1 ; E = Ft(B,C,D) + E + Kt + Wt
+ vprold B, B, 30 ; B = ROTL_30(B)
+ vpaddd E, E, XTMP0 ; E = T
+
+ ROTATE_ARGS
+%endmacro
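+
+; Note on the Ft truth-table immediates passed as %%F_IMMED (0xCA, 0x96, 0xE8 in the
+; round loop below). vpternlogd's imm8 is a 3-input truth table indexed by
+; (op1<<2 | op2<<1 | op3), here (B<<2 | C<<1 | D):
+;	0xCA -> Ch(B,C,D)     = (B & C) ^ (~B & D)		rounds  0-19
+;	0x96 -> Parity(B,C,D) =  B ^ C ^ D			rounds 20-39, 60-79
+;	0xE8 -> Maj(B,C,D)    = (B & C) ^ (B & D) ^ (C & D)	rounds 40-59
+; The same 0x96 (3-way XOR) immediate is used for the message schedule in
+; MSG_SCHED_ROUND_16_79_MUR below.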
+
+;; Insert murmur's instructions into the stitched macros below.
+;; Each block_loop iteration of mh_sha1 runs PROCESS_LOOP 80 times and
+;; MSG_SCHED_ROUND_16_79 64 times while processing 1024 bytes, so one 16-byte
+;; murmur block is interleaved into each of the 64 stitched rounds.
+%macro PROCESS_LOOP_MUR 2
+%define %%WT %1
+%define %%F_IMMED %2
+
+ ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
+ ; E=D, D=C, C=ROTL_30(B), B=A, A=T
+
+ ; Ft
+ ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
+ ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
+ ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
+
+ mov mur_data1, [mur_in_p]
+ mov mur_data2, [mur_in_p + 8]
+ vmovdqa32 XTMP1, B ; Copy B
+ imul mur_data1, mur_c1_r
+ imul mur_data2, mur_c2_r
+ vpaddd E, E, %%WT ; E = E + Wt
+ rol mur_data1, R1
+ rol mur_data2, R2
+ vpternlogd XTMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D)
+ imul mur_data1, mur_c2_r
+ imul mur_data2, mur_c1_r
+ vpaddd E, E, KT ; E = E + Wt + Kt
+ xor mur_hash1, mur_data1
+ add mur_in_p, 16
+ vprold XTMP0, A, 5 ; TMP0 = ROTL_5(A)
+ rol mur_hash1, R3
+ vpaddd E, E, XTMP1 ; E = Ft(B,C,D) + E + Kt + Wt
+ add mur_hash1, mur_hash2
+ vprold B, B, 30 ; B = ROTL_30(B)
+ lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1]
+ vpaddd E, E, XTMP0 ; E = T
+ xor mur_hash2, mur_data2
+
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_79_MUR 4
+%define %%WT %1
+%define %%WTp2 %2
+%define %%WTp8 %3
+%define %%WTp13 %4
+ ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16)
+ ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt)
+ vpternlogd %%WT, %%WTp2, %%WTp8, 0x96
+ rol mur_hash2, R4
+ vpxord %%WT, %%WT, %%WTp13
+ add mur_hash2, mur_hash1
+ lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2]
+ vprold %%WT, %%WT, 1
+%endmacro
+
+%define APPEND(a,b) a %+ b
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8d
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbx
+ push rbp
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10d
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8
+ %define PS 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+	; unwind info macros are not used here; plain labels and manual register save/restore instead
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ sub rsp, stack_size
+ movdqa [rsp + 0*16], xmm6
+ movdqa [rsp + 1*16], xmm7
+ movdqa [rsp + 2*16], xmm8
+ movdqa [rsp + 3*16], xmm9
+ movdqa [rsp + 4*16], xmm10
+ movdqa [rsp + 5*16], xmm11
+ movdqa [rsp + 6*16], xmm12
+ movdqa [rsp + 7*16], xmm13
+ movdqa [rsp + 8*16], xmm14
+ movdqa [rsp + 9*16], xmm15
+ mov [rsp + 10*16 + 0*8], r12
+ mov [rsp + 10*16 + 1*8], r13
+ mov [rsp + 10*16 + 2*8], r14
+ mov [rsp + 10*16 + 3*8], r15
+ mov [rsp + 10*16 + 4*8], rdi
+ mov [rsp + 10*16 + 5*8], rsi
+ mov [rsp + 10*16 + 6*8], rbx
+ mov [rsp + 10*16 + 7*8], rbp
+ mov arg4, arg(4)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ mov rbx, [rsp + 10*16 + 6*8]
+ mov rbp, [rsp + 10*16 + 7*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg4
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables of murmur3
+%define mur_in_p tmp2
+%define mur_digest_p arg3
+%define mur_hash1 tmp3
+%define mur_hash2 tmp4
+%define mur_data1 tmp5
+%define mur_data2 return
+%define mur_c1_r tmp6
+%define mur_c2_r arg5
+; constants of murmur3_x64_128
+%define R1 31
+%define R2 33
+%define R3 27
+%define R4 31
+%define M 5
+%define N1 0x52dce729;DWORD
+%define N2 0x38495ab5;DWORD
+%define C1 QWORD(0x87c37b91114253d5)
+%define C2 QWORD(0x4cf5ad432745937f)
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp7
+
+%define pref tmp8
+%macro PREFETCH_X 1
+%define %%mem %1
+ prefetchnta %%mem
+%endmacro
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h4 | h4 | h4 | ...| h4 |    | Ea| Eb | Ec |...| Ep |
+
+[bits 64]
+section .text
+align 32
+
+;void mh_sha1_murmur3_x64_128_block_avx512 (const uint8_t * input_data,
+; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+; uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, include segments digests(uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer which is used to save the big_endian data.
+; arg 3 pointer to murmur3 digest
+; arg 4 number of 1KB blocks
+;
+global mh_sha1_murmur3_x64_128_block_avx512
+func(mh_sha1_murmur3_x64_128_block_avx512)
+ FUNC_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; align rsp to 64 Bytes needed by avx512
+ and rsp, ~0x3f
+
+ ; copy segs_digests into registers.
+ VMOVPS HH0, [mh_digests_p + 64*0]
+ VMOVPS HH1, [mh_digests_p + 64*1]
+ VMOVPS HH2, [mh_digests_p + 64*2]
+ VMOVPS HH3, [mh_digests_p + 64*3]
+ VMOVPS HH4, [mh_digests_p + 64*4]
+ ;a mask used to transform to big-endian data
+ vmovdqa64 SHUF_MASK, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ ;init murmur variables
+	mov mur_in_p, mh_in_p ;murmur3 and mh_sha1 step through the input at different strides
+ ;load murmur hash digests and multiplier
+ mov mur_hash1, [mur_digest_p]
+ mov mur_hash2, [mur_digest_p + 8]
+ mov mur_c1_r, C1
+ mov mur_c2_r, C2
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ ;using extra 16 ZMM registers instead of stack
+%assign I 0
+%rep 8
+%assign J (I+1)
+ VMOVPS APPEND(W,I),[mh_in_p + I*64+0*64]
+ VMOVPS APPEND(W,J),[mh_in_p + I*64+1*64]
+
+ vpshufb APPEND(W,I), APPEND(W,I), SHUF_MASK
+ vpshufb APPEND(W,J), APPEND(W,J), SHUF_MASK
+%assign I (I+2)
+%endrep
+
+ vmovdqa64 A, HH0
+ vmovdqa64 B, HH1
+ vmovdqa64 C, HH2
+ vmovdqa64 D, HH3
+ vmovdqa64 E, HH4
+
+ vmovdqa32 KT, [K00_19]
+%assign I 0xCA
+%assign J 0
+%assign K 2
+%assign L 8
+%assign M 13
+%assign N 0
+%rep 80
+ %if N < 64 ; stitching 64 times
+ PROCESS_LOOP_MUR APPEND(W,J), I
+ MSG_SCHED_ROUND_16_79_MUR APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ %else ; 64 <= N < 80, without stitching
+ PROCESS_LOOP APPEND(W,J), I
+ %endif
+ %if N = 19
+ vmovdqa32 KT, [K20_39]
+ %assign I 0x96
+ %elif N = 39
+ vmovdqa32 KT, [K40_59]
+ %assign I 0xE8
+ %elif N = 59
+ vmovdqa32 KT, [K60_79]
+ %assign I 0x96
+ %endif
+ %if N % 20 = 19
+ PREFETCH_X [mh_in_p + 1024+128*(N / 20)]
+ PREFETCH_X [mh_in_p + 1024+128*(N / 20 +1)]
+ %endif
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%assign N (N+1)
+%endrep
+
+ ; Add old digest
+ vpaddd HH0,A, HH0
+ vpaddd HH1,B, HH1
+ vpaddd HH2,C, HH2
+ vpaddd HH3,D, HH3
+ vpaddd HH4,E, HH4
+
+ add mh_in_p, 1024
+ sub loops, 1
+ jne .block_loop
+
+ ;store murmur-hash digest
+ mov [mur_digest_p], mur_hash1
+ mov [mur_digest_p + 8], mur_hash2
+
+ ; copy segs_digests to mh_digests_p
+ VMOVPS [mh_digests_p + 64*0], HH0
+ VMOVPS [mh_digests_p + 64*1], HH1
+ VMOVPS [mh_digests_p + 64*2], HH2
+ VMOVPS [mh_digests_p + 64*3], HH3
+ VMOVPS [mh_digests_p + 64*4], HH4
+
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+
+section .data align=64
+
+align 64
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203
+ dq 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+ dq 0x5A8279995A827999
+
+K20_39: dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1
+
+K40_59: dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC
+
+K60_79: dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_murmur3_x64_128_block_avx512
+no_sha1_murmur3_x64_128_block_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm
new file mode 100644
index 000000000..174a8518f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_block_sse.asm
@@ -0,0 +1,698 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; code to compute stitched mh_sha1 (16-segment SHA1) and murmur3_x64_128 using SSE
+;;
+
+%include "reg_sizes.asm"
+default rel
+
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regC
+ pxor %%regF,%%regD
+ pand %%regF,%%regB
+ pxor %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regD
+ pxor %%regF,%%regC
+ pxor %%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regB
+ movdqa %%regT,%%regB
+ por %%regF,%%regC
+ pand %%regT,%%regC
+ pand %%regF,%%regD
+ por %%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ pslld %%reg, %%imm
+ psrld %%tmp, (32-%%imm)
+ por %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ paddd %%regE,[%%data + (%%memW * 16)]
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%macro SHA1_STEP_16_79 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+;; Murmur3's instructions are stitched into the SHA1_STEP_16_79 macros below.
+;; Each pass of mh_sha1's segs_loop calls SHA1_STEP_16_79 64 times and processes 256 bytes,
+;; so one 16-byte murmur3 block is spread across every 4 consecutive SHA1_STEP_16_79 steps.
+%define SHA1_STEP_16_79(J) SHA1_STEP_16_79_ %+ J
+
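+;; For reference, the four variants below split one scalar murmur3_x64_128 round
+;; roughly as follows (a sketch only; the data and hash values live in 64-bit GPRs):
+;;   _0: data1 = load64(in); data2 = load64(in+8); data1 *= C1; data2 *= C2;
+;;   _1: data1 = rol64(data1,R1); data1 *= C2; data2 = rol64(data2,R2); data2 *= C1; in += 16;
+;;   _2: hash1 ^= data1; hash1 = rol64(hash1,R3); hash1 += hash2; hash1 = hash1*M + N1;
+;;   _3: hash2 ^= data2; hash2 = rol64(hash2,R4); hash2 += hash1; hash2 = hash2*M + N2;
+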
+%macro SHA1_STEP_16_79_0 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+
+ paddd %%regE,%%immCNT
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ mov mur_data1, [mur_in_p]
+ mov mur_data2, [mur_in_p + 8]
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ imul mur_data1, mur_c1_r
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ imul mur_data2, mur_c2_r
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_1 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ rol mur_data1, R1
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ pslld W16, 1
+ rol mur_data2, R2
+ psrld %%regF, (32-1)
+ por %%regF, W16
+
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ imul mur_data1, mur_c2_r
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ imul mur_data2, mur_c1_r
+ PROLD %%regB,30, %%regT
+ add mur_in_p, 16
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_2 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ xor mur_hash1, mur_data1
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ rol mur_hash1, R3
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ add mur_hash1, mur_hash2
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ lea mur_hash1, [mur_hash1 + mur_hash1*4 + N1]
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79_3 11
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+%define %%data %11
+ paddd %%regE,%%immCNT
+ movdqa W14, [%%data + ((%%memW - 14) & 15) * 16]
+ xor mur_hash2, mur_data2
+ pxor W16, W14
+ pxor W16, [%%data + ((%%memW - 8) & 15) * 16]
+ pxor W16, [%%data + ((%%memW - 3) & 15) * 16]
+ rol mur_hash2, R4
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+
+ ROTATE_W
+
+ movdqa [%%data + ((%%memW - 0) & 15) * 16],%%regF
+ add mur_hash2, mur_hash1
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ lea mur_hash2, [mur_hash2 + mur_hash2*4 + N2]
+ paddd %%regE,%%regF
+%endmacro
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define arg0 rdi
+ %define arg1 rsi
+ %define arg2 rdx
+ %define arg3 rcx
+
+ %define arg4 r8d
+ %define arg5 r9
+
+ %define tmp1 r10
+ %define tmp2 r11
+ %define tmp3 r12 ; must be saved and restored
+ %define tmp4 r13 ; must be saved and restored
+ %define tmp5 r14 ; must be saved and restored
+ %define tmp6 r15 ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define func(x) x:
+ %macro FUNC_SAVE 0
+ push r12
+ push r13
+ push r14
+ push r15
+ push rbx
+ push rbp
+ %endmacro
+ %macro FUNC_RESTORE 0
+ pop rbp
+ pop rbx
+ pop r15
+ pop r14
+ pop r13
+ pop r12
+ %endmacro
+%else
+ ; Windows
+ %define arg0 rcx
+ %define arg1 rdx
+ %define arg2 r8
+ %define arg3 r9
+
+ %define arg4 r10d
+ %define arg5 r11
+ %define tmp1 r12 ; must be saved and restored
+ %define tmp2 r13 ; must be saved and restored
+ %define tmp3 r14 ; must be saved and restored
+ %define tmp4 r15 ; must be saved and restored
+ %define tmp5 rdi ; must be saved and restored
+ %define tmp6 rsi ; must be saved and restored
+ %define tmp7 rbx ; must be saved and restored
+ %define tmp8 rbp ; must be saved and restored
+ %define return rax
+
+ %define stack_size 10*16 + 9*8 ; must be an odd multiple of 8
+ %define PS 8
+ %define arg(x) [rsp + stack_size + PS + PS*x]
+ %define func(x) proc_frame x
+ %macro FUNC_SAVE 0
+ alloc_stack stack_size
+ save_xmm128 xmm6, 0*16
+ save_xmm128 xmm7, 1*16
+ save_xmm128 xmm8, 2*16
+ save_xmm128 xmm9, 3*16
+ save_xmm128 xmm10, 4*16
+ save_xmm128 xmm11, 5*16
+ save_xmm128 xmm12, 6*16
+ save_xmm128 xmm13, 7*16
+ save_xmm128 xmm14, 8*16
+ save_xmm128 xmm15, 9*16
+ save_reg r12, 10*16 + 0*8
+ save_reg r13, 10*16 + 1*8
+ save_reg r14, 10*16 + 2*8
+ save_reg r15, 10*16 + 3*8
+ save_reg rdi, 10*16 + 4*8
+ save_reg rsi, 10*16 + 5*8
+ save_reg rbx, 10*16 + 6*8
+ save_reg rbp, 10*16 + 7*8
+ end_prolog
+ mov arg4, arg(4)
+ %endmacro
+
+ %macro FUNC_RESTORE 0
+ movdqa xmm6, [rsp + 0*16]
+ movdqa xmm7, [rsp + 1*16]
+ movdqa xmm8, [rsp + 2*16]
+ movdqa xmm9, [rsp + 3*16]
+ movdqa xmm10, [rsp + 4*16]
+ movdqa xmm11, [rsp + 5*16]
+ movdqa xmm12, [rsp + 6*16]
+ movdqa xmm13, [rsp + 7*16]
+ movdqa xmm14, [rsp + 8*16]
+ movdqa xmm15, [rsp + 9*16]
+ mov r12, [rsp + 10*16 + 0*8]
+ mov r13, [rsp + 10*16 + 1*8]
+ mov r14, [rsp + 10*16 + 2*8]
+ mov r15, [rsp + 10*16 + 3*8]
+ mov rdi, [rsp + 10*16 + 4*8]
+ mov rsi, [rsp + 10*16 + 5*8]
+ mov rbx, [rsp + 10*16 + 6*8]
+ mov rbp, [rsp + 10*16 + 7*8]
+ add rsp, stack_size
+ %endmacro
+%endif
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define loops arg4
+;variables of mh_sha1
+%define mh_in_p arg0
+%define mh_digests_p arg1
+%define mh_data_p arg2
+%define mh_segs tmp1
+;variables of murmur3
+%define mur_in_p tmp2
+%define mur_digest_p arg3
+%define mur_hash1 tmp3
+%define mur_hash2 tmp4
+%define mur_data1 tmp5
+%define mur_data2 return
+%define mur_c1_r tmp6
+%define mur_c2_r arg5
+; constants of murmur3_x64_128
+%define R1 31
+%define R2 33
+%define R3 27
+%define R4 31
+%define M 5
+%define N1 0x52dce729;DWORD
+%define N2 0x38495ab5;DWORD
+%define C1 QWORD(0x87c37b91114253d5)
+%define C2 QWORD(0x4cf5ad432745937f)
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;variables used by storing segs_digests on stack
+%define RSP_SAVE tmp7
+%define FRAMESZ 4*5*16 ;BYTES*DWORDS*SEGS
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%define MOVPS movups
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+
+;init hash digests
+; segs_digests:low addr-> high_addr
+; a | b | c | ...| p | (16)
+; h0 | h0 | h0 | ...| h0 | | Aa| Ab | Ac |...| Ap |
+; h1 | h1 | h1 | ...| h1 | | Ba| Bb | Bc |...| Bp |
+; ....
+; h5 | h5 | h5 | ...| h5 | | Ea| Eb | Ec |...| Ep |
+
+align 32
+;void mh_sha1_murmur3_x64_128_block_sse (const uint8_t * input_data,
+; uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+; uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+; uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+; uint32_t num_blocks);
+; arg 0 pointer to input data
+; arg 1 pointer to digests, which includes the segment digests (uint32_t digests[16][5])
+; arg 2 pointer to aligned_frame_buffer, which is used to store the big-endian data.
+; arg 3 pointer to murmur3 digest
+; arg 4 number of 1KB blocks
+;
+global mh_sha1_murmur3_x64_128_block_sse:function internal
+func(mh_sha1_murmur3_x64_128_block_sse)
+ FUNC_SAVE
+ ; save rsp
+ mov RSP_SAVE, rsp
+
+ cmp loops, 0
+ jle .return
+
+ ; leave enough space to store segs_digests
+ sub rsp, FRAMESZ
+ ; align rsp to 16 Bytes needed by sse
+ and rsp, ~0x0F
+
+ %assign I 0 ; copy segs_digests into stack
+ %rep 5
+ MOVPS A, [mh_digests_p + I*64 + 16*0]
+ MOVPS B, [mh_digests_p + I*64 + 16*1]
+ MOVPS C, [mh_digests_p + I*64 + 16*2]
+ MOVPS D, [mh_digests_p + I*64 + 16*3]
+
+ movdqa [rsp + I*64 + 16*0], A
+ movdqa [rsp + I*64 + 16*1], B
+ movdqa [rsp + I*64 + 16*2], C
+ movdqa [rsp + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+
+ ;init murmur variables
+	mov mur_in_p, mh_in_p ;murmur3 and mh_sha1 step through the input at different strides
+ ;load murmur hash digests and multiplier
+ mov mur_hash1, [mur_digest_p]
+ mov mur_hash2, [mur_digest_p + 8]
+ mov mur_c1_r, C1
+ mov mur_c2_r, C2
+
+.block_loop:
+ ;transform to big-endian data and store on aligned_frame
+ movdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+ ;transform input data from DWORD*16_SEGS*5 to DWORD*4_SEGS*5*4
+ %assign I 0
+ %rep 16
+ MOVPS T0,[mh_in_p+I*64+0*16]
+ MOVPS T1,[mh_in_p+I*64+1*16]
+ MOVPS T2,[mh_in_p+I*64+2*16]
+ MOVPS T3,[mh_in_p+I*64+3*16]
+
+ pshufb T0, F
+ movdqa [mh_data_p+(I)*16 +0*256],T0
+ pshufb T1, F
+ movdqa [mh_data_p+(I)*16 +1*256],T1
+ pshufb T2, F
+ movdqa [mh_data_p+(I)*16 +2*256],T2
+ pshufb T3, F
+ movdqa [mh_data_p+(I)*16 +3*256],T3
+ %assign I (I+1)
+ %endrep
+
+ mov mh_segs, 0 ;start from the first 4 segments
+ .segs_loop:
+ ;; Initialize digests
+ movdqa A, [rsp + 0*64 + mh_segs]
+ movdqa B, [rsp + 1*64 + mh_segs]
+ movdqa C, [rsp + 2*64 + mh_segs]
+ movdqa D, [rsp + 3*64 + mh_segs]
+ movdqa E, [rsp + 4*64 + mh_segs]
+
+ movdqa AA, A
+ movdqa BB, B
+ movdqa CC, C
+ movdqa DD, D
+ movdqa EE, E
+;;
+;; perform 0-79 steps
+;;
+ movdqa K, [K00_19]
+;; do rounds 0...15
+ %assign I 0
+ %rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 16...19
+ movdqa W16, [mh_data_p + ((16 - 16) & 15) * 16]
+ movdqa W15, [mh_data_p + ((16 - 15) & 15) * 16]
+ %rep 4
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 20...39
+ movdqa K, [K20_39]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 40...59
+ movdqa K, [K40_59]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+;; do rounds 60...79
+ movdqa K, [K60_79]
+ %rep 20
+ %assign J (I % 4)
+ SHA1_STEP_16_79(J) A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3, mh_data_p
+ ROTATE_ARGS
+ %assign I (I+1)
+ %endrep
+
+ paddd A, AA
+ paddd B, BB
+ paddd C, CC
+ paddd D, DD
+ paddd E, EE
+
+ ; write out digests
+ movdqa [rsp + 0*64 + mh_segs], A
+ movdqa [rsp + 1*64 + mh_segs], B
+ movdqa [rsp + 2*64 + mh_segs], C
+ movdqa [rsp + 3*64 + mh_segs], D
+ movdqa [rsp + 4*64 + mh_segs], E
+
+ add mh_data_p, 256
+ add mh_segs, 16
+ cmp mh_segs, 64
+ jc .segs_loop
+
+ sub mh_data_p, (1024)
+ add mh_in_p, (1024)
+ sub loops, 1
+ jne .block_loop
+
+ ;store murmur-hash digest
+ mov [mur_digest_p], mur_hash1
+ mov [mur_digest_p + 8], mur_hash2
+
+ %assign I 0 ; copy segs_digests back to mh_digests_p
+ %rep 5
+ movdqa A, [rsp + I*64 + 16*0]
+ movdqa B, [rsp + I*64 + 16*1]
+ movdqa C, [rsp + I*64 + 16*2]
+ movdqa D, [rsp + I*64 + 16*3]
+
+ MOVPS [mh_digests_p + I*64 + 16*0], A
+ MOVPS [mh_digests_p + I*64 + 16*1], B
+ MOVPS [mh_digests_p + I*64 + 16*2], C
+ MOVPS [mh_digests_p + I*64 + 16*3], D
+ %assign I (I+1)
+ %endrep
+ mov rsp, RSP_SAVE ; restore rsp
+
+.return:
+ FUNC_RESTORE
+ ret
+
+endproc_frame
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c
new file mode 100644
index 000000000..6eb998257
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_finalize_base.c
@@ -0,0 +1,102 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef FINALIZE_FUNCTION
+#include <stdlib.h> // For NULL
+#include "mh_sha1_murmur3_x64_128_internal.h"
+
+#define FINALIZE_FUNCTION mh_sha1_murmur3_x64_128_finalize_base
+#define MH_SHA1_TAIL_FUNCTION mh_sha1_tail_base
+#define FINALIZE_FUNCTION_SLVER
+#endif
+
+#define MURMUR_BLOCK_FUNCTION murmur3_x64_128_block
+#define MURMUR_TAIL_FUNCTION murmur3_x64_128_tail
+
+int FINALIZE_FUNCTION(struct mh_sha1_murmur3_x64_128_ctx *ctx, void *mh_sha1_digest,
+ void *murmur3_x64_128_digest)
+{
+ uint8_t *partial_block_buffer, *murmur_tail_data;
+ uint64_t partial_block_len, total_len;
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+
+ if (ctx == NULL)
+ return MH_SHA1_MURMUR3_CTX_ERROR_NULL;
+
+ total_len = ctx->total_length;
+ partial_block_len = total_len % MH_SHA1_BLOCK_SIZE;
+ partial_block_buffer = ctx->partial_block_buffer;
+
+	// Calculate murmur3 first, because the mh_sha1 tail processing below will
+	// modify partial_block_buffer.
+	// (partial_block_buffer = n murmur3 blocks followed by 1 murmur3 tail)
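+	// For example, with partial_block_len = 37 the buffer is consumed as
+	//   [16-byte murmur3 block][16-byte murmur3 block][5-byte murmur3 tail]
+	// by the two calls below.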
+ murmur_tail_data =
+ partial_block_buffer + partial_block_len - partial_block_len % MUR_BLOCK_SIZE;
+ MURMUR_BLOCK_FUNCTION(partial_block_buffer, partial_block_len / MUR_BLOCK_SIZE,
+ ctx->murmur3_x64_128_digest);
+ MURMUR_TAIL_FUNCTION(murmur_tail_data, total_len, ctx->murmur3_x64_128_digest);
+
+ /* mh_sha1 final */
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+
+ MH_SHA1_TAIL_FUNCTION(partial_block_buffer, total_len, mh_sha1_segs_digests,
+ aligned_frame_buffer, ctx->mh_sha1_digest);
+
+ /* Output the digests of murmur3 and mh_sha1 */
+ if (mh_sha1_digest != NULL) {
+ ((uint32_t *) mh_sha1_digest)[0] = ctx->mh_sha1_digest[0];
+ ((uint32_t *) mh_sha1_digest)[1] = ctx->mh_sha1_digest[1];
+ ((uint32_t *) mh_sha1_digest)[2] = ctx->mh_sha1_digest[2];
+ ((uint32_t *) mh_sha1_digest)[3] = ctx->mh_sha1_digest[3];
+ ((uint32_t *) mh_sha1_digest)[4] = ctx->mh_sha1_digest[4];
+ }
+
+ if (murmur3_x64_128_digest != NULL) {
+ ((uint32_t *) murmur3_x64_128_digest)[0] = ctx->murmur3_x64_128_digest[0];
+ ((uint32_t *) murmur3_x64_128_digest)[1] = ctx->murmur3_x64_128_digest[1];
+ ((uint32_t *) murmur3_x64_128_digest)[2] = ctx->murmur3_x64_128_digest[2];
+ ((uint32_t *) murmur3_x64_128_digest)[3] = ctx->murmur3_x64_128_digest[3];
+ }
+
+ return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+}
+
+#ifdef FINALIZE_FUNCTION_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+ // Version info
+struct slver mh_sha1_murmur3_x64_128_finalize_base_slver_0000025b;
+struct slver mh_sha1_murmur3_x64_128_finalize_base_slver = { 0x025b, 0x00, 0x00 };
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h
new file mode 100644
index 000000000..bb16c58d6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_internal.h
@@ -0,0 +1,202 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef _MH_SHA1_MURMUR3_X64_128_INTERNAL_H_
+#define _MH_SHA1_MURMUR3_X64_128_INTERNAL_H_
+
+/**
+ * @file mh_sha1_murmur3_x64_128_internal.h
+ * @brief mh_sha1_murmur3_x64_128 internal function prototypes and macros
+ *
+ * Interface for mh_sha1_murmur3_x64_128 internal functions
+ *
+ */
+#include <stdint.h>
+#include "mh_sha1_internal.h"
+#include "mh_sha1_murmur3_x64_128.h"
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#ifdef _MSC_VER
+# define inline __inline
+#endif
+
+ /*******************************************************************
+ * mh_sha1_murmur3_x64_128 API internal function prototypes
+ * Multiple versions of Update and Finalize functions are supplied which use
+ * multiple versions of block and tail process subfunctions.
+ ******************************************************************/
+
+ /**
+	 * @brief Calculate blocks whose total size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * This function determines what instruction sets are enabled and selects the
+ * appropriate version at runtime.
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+	 // Each function needs an individual C or ASM file because they have a large impact on performance.
+	 // They will be called by mh_sha1_murmur3_x64_128_update_XXX.
+ void mh_sha1_murmur3_x64_128_block (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+ /**
+	 * @brief Calculate blocks whose total size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_murmur3_x64_128_block_base (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+ /**
+	 * @brief Calculate blocks whose total size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires SSE
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_murmur3_x64_128_block_sse (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+ /**
+	 * @brief Calculate blocks whose total size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires AVX
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_murmur3_x64_128_block_avx (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+ /**
+	 * @brief Calculate blocks whose total size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires AVX2
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_murmur3_x64_128_block_avx2 (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+
+ /**
+	 * @brief Calculate blocks whose total size is MH_SHA1_BLOCK_SIZE*N
+ *
+ * @requires AVX512
+ *
+ * @param input_data Pointer to input data to be processed
+ * @param mh_sha1_digests 16 segments digests
+ * @param frame_buffer Pointer to buffer which is a temp working area
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @param num_blocks The number of blocks.
+ * @returns none
+ *
+ */
+ void mh_sha1_murmur3_x64_128_block_avx512 (const uint8_t * input_data,
+ uint32_t mh_sha1_digests[SHA1_DIGEST_WORDS][HASH_SEGS],
+ uint8_t frame_buffer[MH_SHA1_BLOCK_SIZE],
+ uint32_t murmur3_x64_128_digests[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t num_blocks);
+ /*******************************************************************
+ * murmur hash API
+ ******************************************************************/
+
+ /**
+	 * @brief Calculate the murmur digest of blocks whose total size is 16*N.
+ * @param input_data Pointer to input data to be processed
+	 * @param num_blocks The number of 16-byte blocks.
+ * @param murmur3_x64_128_digests Murmur3 digest
+ * @returns none
+ *
+ */
+ void murmur3_x64_128_block(const uint8_t * input_data, uint32_t num_blocks,
+ uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS]);
+
+ /**
+	 * @brief Process the tail, which is less than 16 bytes.
+ * @param tail_buffer Pointer to input data to be processed
+ * @param total_len The total length of the input_data
+ * @param digests Murmur3 digest
+ * @returns none
+ *
+ */
+ void murmur3_x64_128_tail(const uint8_t * tail_buffer, uint32_t total_len,
+ uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS]);
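+
+	 /* Illustrative only: how the block/tail pair above is typically driven
+	  * over a buffer of total_len bytes, mirroring
+	  * mh_sha1_murmur3_x64_128_finalize_base(); the murmur3 block size is 16
+	  * bytes and digests holds the seeded/interim murmur3 state:
+	  *
+	  *   murmur3_x64_128_block(buf, total_len / 16, digests);
+	  *   murmur3_x64_128_tail(buf + total_len - total_len % 16, total_len, digests);
+	  */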
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm
new file mode 100644
index 000000000..96502c32c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_multibinary.asm
@@ -0,0 +1,82 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+%ifidn __OUTPUT_FORMAT__, elf32
+ [bits 32]
+%else
+ default rel
+ [bits 64]
+
+ extern mh_sha1_murmur3_x64_128_update_sse
+ extern mh_sha1_murmur3_x64_128_update_avx
+ extern mh_sha1_murmur3_x64_128_update_avx2
+ extern mh_sha1_murmur3_x64_128_finalize_sse
+ extern mh_sha1_murmur3_x64_128_finalize_avx
+ extern mh_sha1_murmur3_x64_128_finalize_avx2
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ extern mh_sha1_murmur3_x64_128_update_avx512
+ extern mh_sha1_murmur3_x64_128_finalize_avx512
+ %endif
+
+%endif
+
+extern mh_sha1_murmur3_x64_128_update_base
+extern mh_sha1_murmur3_x64_128_finalize_base
+
+mbin_interface mh_sha1_murmur3_x64_128_update
+mbin_interface mh_sha1_murmur3_x64_128_finalize
+
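+; The mbin_* macros from multibinary.asm build the dispatched
+; mh_sha1_murmur3_x64_128_update/_finalize entry points: each one selects among
+; the implementations listed below (base, SSE, AVX, AVX2 and, when the
+; assembler supports it, AVX512) according to the CPU features detected at
+; runtime.
+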
+%ifidn __OUTPUT_FORMAT__, elf64
+
+ %ifdef HAVE_AS_KNOWS_AVX512
+ mbin_dispatch_init6 mh_sha1_murmur3_x64_128_update, mh_sha1_murmur3_x64_128_update_base, mh_sha1_murmur3_x64_128_update_sse, mh_sha1_murmur3_x64_128_update_avx, mh_sha1_murmur3_x64_128_update_avx2, mh_sha1_murmur3_x64_128_update_avx512
+ mbin_dispatch_init6 mh_sha1_murmur3_x64_128_finalize, mh_sha1_murmur3_x64_128_finalize_base, mh_sha1_murmur3_x64_128_finalize_sse, mh_sha1_murmur3_x64_128_finalize_avx, mh_sha1_murmur3_x64_128_finalize_avx2, mh_sha1_murmur3_x64_128_finalize_avx512
+ %else
+ mbin_dispatch_init5 mh_sha1_murmur3_x64_128_update, mh_sha1_murmur3_x64_128_update_base, mh_sha1_murmur3_x64_128_update_sse, mh_sha1_murmur3_x64_128_update_avx, mh_sha1_murmur3_x64_128_update_avx2
+ mbin_dispatch_init5 mh_sha1_murmur3_x64_128_finalize, mh_sha1_murmur3_x64_128_finalize_base, mh_sha1_murmur3_x64_128_finalize_sse, mh_sha1_murmur3_x64_128_finalize_avx, mh_sha1_murmur3_x64_128_finalize_avx2
+ %endif
+
+%else
+ mbin_dispatch_init2 mh_sha1_murmur3_x64_128_update, mh_sha1_murmur3_x64_128_update_base
+ mbin_dispatch_init2 mh_sha1_murmur3_x64_128_finalize, mh_sha1_murmur3_x64_128_finalize_base
+%endif
+
+;;; func core, ver, snum
+slversion mh_sha1_murmur3_x64_128_update, 00, 02, 0252
+slversion mh_sha1_murmur3_x64_128_finalize, 00, 02, 0253
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c
new file mode 100644
index 000000000..8a17fdfd7
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_perf.c
@@ -0,0 +1,206 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1_murmur3_x64_128.h"
+#include "test.h"
+
+//#define CACHED_TEST
+#ifdef CACHED_TEST
+// Loop many times over the same data
+# define TEST_LEN 16*1024
+# define TEST_LOOPS 20000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define TEST_LEN 32*1024*1024
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+#define TEST_MEM TEST_LEN
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_MURMUR3_CTX_ERROR_NONE){ \
+		printf("The stitch function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest);
+
+extern void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * murmur3_x64_128_digest);
+
+void mh_sha1_murmur3_x64_128_base(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * mh_sha1_digest, uint32_t * murmur3_x64_128_digest)
+{
+ mh_sha1_ref(buffer, len, mh_sha1_digest);
+ murmur3_x64_128(buffer, len, murmur_seed, murmur3_x64_128_digest);
+
+ return;
+}
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS],
+ uint32_t murmur3_base[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+ int murmur3_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha1_fail++;
+ }
+
+ for (i = 0; i < MURMUR3_x64_128_DIGEST_WORDS; i++) {
+ if (murmur3_test[i] != murmur3_base[i])
+ murmur3_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 20);
+ printf("ref: ");
+ dump((char *)hash_test, 20);
+ }
+ if (murmur3_fail) {
+ printf("murmur3 fail test\n");
+ printf("base: ");
+ dump((char *)murmur3_base, 16);
+ printf("ref: ");
+ dump((char *)murmur3_test, 16);
+ }
+
+ return mh_sha1_fail + murmur3_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int i, fail = 0;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS];
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS],
+ murmur3_base[MURMUR3_x64_128_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ struct mh_sha1_murmur3_x64_128_ctx *update_ctx = NULL;
+ struct perf start, stop;
+
+ printf(xstr(TEST_UPDATE_FUNCTION) "_perf:\n");
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+		printf("malloc failed, test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ // mh_sha1_murmur3 base version
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, murmur3_base);
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS / 10; i++) {
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base,
+ murmur3_base);
+ }
+ perf_stop(&stop);
+ printf("mh_sha1_murmur3_x64_128_base" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ //Update feature test
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ perf_start(&start);
+ for (i = 0; i < TEST_LOOPS; i++) {
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+ }
+ perf_stop(&stop);
+ printf(xstr(TEST_UPDATE_FUNCTION) TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_MEM * i);
+
+ // Check results
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", TEST_LEN);
+ return -1;
+ }
+
+	printf("Pass func check\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c
new file mode 100644
index 000000000..a2ea8ce92
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_test.c
@@ -0,0 +1,248 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1_murmur3_x64_128.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_MURMUR3_CTX_ERROR_NONE){ \
+		printf("The stitch function failed.\n"); \
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest);
+
+extern void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * murmur3_x64_128_digest);
+
+void mh_sha1_murmur3_x64_128_base(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * mh_sha1_digest, uint32_t * murmur3_x64_128_digest)
+{
+ mh_sha1_ref(buffer, len, mh_sha1_digest);
+ murmur3_x64_128(buffer, len, murmur_seed, murmur3_x64_128_digest);
+
+ return;
+}
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS],
+ uint32_t murmur3_base[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+ int murmur3_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha1_fail++;
+ }
+
+ for (i = 0; i < MURMUR3_x64_128_DIGEST_WORDS; i++) {
+ if (murmur3_test[i] != murmur3_base[i])
+ murmur3_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 20);
+ printf("ref: ");
+ dump((char *)hash_test, 20);
+ }
+ if (murmur3_fail) {
+ printf("murmur3 fail test\n");
+ printf("base: ");
+ dump((char *)murmur3_base, 16);
+ printf("ref: ");
+ dump((char *)murmur3_test, 16);
+ }
+
+ return mh_sha1_fail + murmur3_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS];
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS],
+ murmur3_base[MURMUR3_x64_128_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int size, offset;
+ struct mh_sha1_murmur3_x64_128_ctx *update_ctx = NULL;
+
+ printf(" " xstr(TEST_UPDATE_FUNCTION) "_test:");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+		printf("malloc failed, test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, murmur3_base);
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+ // Test various size messages
+ for (size = TEST_LEN; size >= 0; size--) {
+
+ // Fill with rand data
+ rand_buffer(buff, size);
+
+ mh_sha1_murmur3_x64_128_base(buff, size, TEST_SEED, hash_base, murmur3_base);
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size=%d\n", size);
+ return -1;
+ }
+
+ if ((size & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Test various buffer offsets and sizes
+ printf("offset tests");
+ for (size = TEST_LEN - 256; size > 256; size -= 11) {
+ for (offset = 0; offset < 256; offset++) {
+ mh_sha1_murmur3_x64_128_base(buff + offset, size, TEST_SEED,
+ hash_base, murmur3_base);
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail =
+ compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size=%d offset=%d\n", size, offset);
+ return -1;
+ }
+
+ }
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ // Run efence tests
+ printf("efence tests");
+ for (size = TEST_SIZE; size > 0; size--) {
+ offset = TEST_LEN - size;
+ mh_sha1_murmur3_x64_128_base(buff + offset, size, TEST_SEED,
+ hash_base, murmur3_base);
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size=%d offset=%d\n", size, offset);
+ return -1;
+ }
+
+ if ((size & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c
new file mode 100644
index 000000000..e8d21ac26
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_base.c
@@ -0,0 +1,107 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#ifndef UPDATE_FUNCTION
+#include "mh_sha1_murmur3_x64_128_internal.h"
+#include <string.h>
+
+#define UPDATE_FUNCTION mh_sha1_murmur3_x64_128_update_base
+#define BLOCK_FUNCTION mh_sha1_murmur3_x64_128_block_base
+#define UPDATE_FUNCTION_SLVER
+#endif
+
+int UPDATE_FUNCTION(struct mh_sha1_murmur3_x64_128_ctx *ctx, const void *buffer, uint32_t len)
+{
+
+ uint8_t *partial_block_buffer;
+ uint64_t partial_block_len;
+ uint64_t num_blocks;
+ uint32_t(*mh_sha1_segs_digests)[HASH_SEGS];
+ uint8_t *aligned_frame_buffer;
+ uint32_t *murmur3_x64_128_digest;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+ if (ctx == NULL)
+ return MH_SHA1_MURMUR3_CTX_ERROR_NULL;
+
+ if (len == 0)
+ return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+
+ partial_block_len = ctx->total_length % MH_SHA1_BLOCK_SIZE;
+ partial_block_buffer = ctx->partial_block_buffer;
+ aligned_frame_buffer = (uint8_t *) ALIGN_64(ctx->frame_buffer);
+ mh_sha1_segs_digests = (uint32_t(*)[HASH_SEGS]) ctx->mh_sha1_interim_digests;
+ murmur3_x64_128_digest = ctx->murmur3_x64_128_digest;
+
+ ctx->total_length += len;
+	// Not enough input data for an mh_sha1 block calculation
+ if (len + partial_block_len < MH_SHA1_BLOCK_SIZE) {
+ memcpy(partial_block_buffer + partial_block_len, input_data, len);
+ return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+ }
+ // mh_sha1 calculation for the previous partial block
+ if (partial_block_len != 0) {
+ memcpy(partial_block_buffer + partial_block_len, input_data,
+ MH_SHA1_BLOCK_SIZE - partial_block_len);
+ //do one_block process
+ BLOCK_FUNCTION(partial_block_buffer, mh_sha1_segs_digests,
+ aligned_frame_buffer, murmur3_x64_128_digest, 1);
+ input_data += MH_SHA1_BLOCK_SIZE - partial_block_len;
+ len -= MH_SHA1_BLOCK_SIZE - partial_block_len;
+ memset(partial_block_buffer, 0, MH_SHA1_BLOCK_SIZE);
+ }
+ // Calculate mh_sha1 for the current blocks
+ num_blocks = len / MH_SHA1_BLOCK_SIZE;
+ if (num_blocks > 0) {
+		// Process num_blocks whole blocks
+ BLOCK_FUNCTION(input_data, mh_sha1_segs_digests, aligned_frame_buffer,
+ murmur3_x64_128_digest, num_blocks);
+ len -= num_blocks * MH_SHA1_BLOCK_SIZE;
+ input_data += num_blocks * MH_SHA1_BLOCK_SIZE;
+ }
+ // Store the partial block
+ if (len != 0) {
+ memcpy(partial_block_buffer, input_data, len);
+ }
+
+ return MH_SHA1_MURMUR3_CTX_ERROR_NONE;
+
+}
+
+#ifdef UPDATE_FUNCTION_SLVER
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+
+ // Version info
+struct slver mh_sha1_murmur3_x64_128_update_base_slver_0000025a;
+struct slver mh_sha1_murmur3_x64_128_update_base_slver = { 0x025a, 0x00, 0x00 };
+#endif
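
Editor's note: the #ifndef UPDATE_FUNCTION guard above lets architecture-specific translation units reuse this generic update loop by predefining the function and block names before including the file. A minimal sketch of such a wrapper follows; the _avx symbol names are illustrative assumptions, not part of this patch.

/* Hypothetical wrapper sketch: reuses the base update loop above.
 * Because UPDATE_FUNCTION is predefined, the base file skips its own
 * includes, so the wrapper supplies them itself. */
#include "mh_sha1_murmur3_x64_128_internal.h"
#include <string.h>

#define UPDATE_FUNCTION	mh_sha1_murmur3_x64_128_update_avx
#define BLOCK_FUNCTION	mh_sha1_murmur3_x64_128_block_avx
#include "mh_sha1_murmur3_x64_128_update_base.c"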
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c
new file mode 100644
index 000000000..853e330a4
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/mh_sha1_murmur3_x64_128_update_test.c
@@ -0,0 +1,272 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "mh_sha1_murmur3_x64_128.h"
+
+#define TEST_LEN 16*1024
+#define TEST_SIZE 8*1024
+#define TEST_MEM TEST_LEN
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define str(s) #s
+#define xstr(s) str(s)
+
+#define _FUNC_TOKEN(func, type) func##type
+#define FUNC_TOKEN(func, type) _FUNC_TOKEN(func, type)
+
+#ifndef MH_SHA1_FUNC_TYPE
+#define MH_SHA1_FUNC_TYPE
+#endif
+
+#define TEST_UPDATE_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_update, MH_SHA1_FUNC_TYPE)
+#define TEST_FINAL_FUNCTION FUNC_TOKEN(mh_sha1_murmur3_x64_128_finalize, MH_SHA1_FUNC_TYPE)
+
+#define CHECK_RETURN(state) do{ \
+ if((state) != MH_SHA1_MURMUR3_CTX_ERROR_NONE){ \
+		printf("The stitch function failed.\n");	\
+ return 1; \
+ } \
+ }while(0)
+
+extern void mh_sha1_ref(const void *buffer, uint32_t len, uint32_t * mh_sha1_digest);
+
+extern void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * murmur3_x64_128_digest);
+
+void mh_sha1_murmur3_x64_128_base(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * mh_sha1_digest, uint32_t * murmur3_x64_128_digest)
+{
+ mh_sha1_ref(buffer, len, mh_sha1_digest);
+ murmur3_x64_128(buffer, len, murmur_seed, murmur3_x64_128_digest);
+
+ return;
+}
+
+// Generates pseudo-random data
+void rand_buffer(uint8_t * buf, long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+void dump(char *buf, int len)
+{
+ int i;
+ for (i = 0; i < len;) {
+ printf(" %2x", 0xff & buf[i++]);
+ if (i % 20 == 0)
+ printf("\n");
+ }
+ if (i % 20 != 0)
+ printf("\n");
+}
+
+int compare_digests(uint32_t hash_base[SHA1_DIGEST_WORDS],
+ uint32_t hash_test[SHA1_DIGEST_WORDS],
+ uint32_t murmur3_base[MURMUR3_x64_128_DIGEST_WORDS],
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS])
+{
+ int i;
+ int mh_sha1_fail = 0;
+ int murmur3_fail = 0;
+
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++) {
+ if (hash_test[i] != hash_base[i])
+ mh_sha1_fail++;
+ }
+
+ for (i = 0; i < MURMUR3_x64_128_DIGEST_WORDS; i++) {
+ if (murmur3_test[i] != murmur3_base[i])
+ murmur3_fail++;
+ }
+
+ if (mh_sha1_fail) {
+ printf("mh_sha1 fail test\n");
+ printf("base: ");
+ dump((char *)hash_base, 20);
+ printf("ref: ");
+ dump((char *)hash_test, 20);
+ }
+ if (murmur3_fail) {
+ printf("murmur3 fail test\n");
+ printf("base: ");
+ dump((char *)murmur3_base, 16);
+ printf("ref: ");
+ dump((char *)murmur3_test, 16);
+ }
+
+ return mh_sha1_fail + murmur3_fail;
+}
+
+int main(int argc, char *argv[])
+{
+ int fail = 0, i;
+ uint32_t hash_test[SHA1_DIGEST_WORDS], hash_base[SHA1_DIGEST_WORDS];
+ uint32_t murmur3_test[MURMUR3_x64_128_DIGEST_WORDS],
+ murmur3_base[MURMUR3_x64_128_DIGEST_WORDS];
+ uint8_t *buff = NULL;
+ int update_count;
+ int size1, size2, offset, addr_offset;
+ struct mh_sha1_murmur3_x64_128_ctx *update_ctx = NULL;
+ uint8_t *mem_addr = NULL;
+
+ printf(" " xstr(TEST_UPDATE_FUNCTION) "_test:");
+
+ srand(TEST_SEED);
+
+ buff = malloc(TEST_LEN);
+ update_ctx = malloc(sizeof(*update_ctx));
+
+ if (buff == NULL || update_ctx == NULL) {
+		printf("malloc failed, test aborted\n");
+ return -1;
+ }
+ // Rand test1
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base, murmur3_base);
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("fail rand1 test\n");
+ return -1;
+ } else
+ putchar('.');
+
+	// Test messages of various sizes, split across two update calls.
+ printf("\n various size messages by update twice tests");
+ for (size1 = TEST_LEN; size1 >= 0; size1--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base,
+ murmur3_base);
+
+ // subsequent update
+		size2 = TEST_LEN - size1;	// size2 is the remainder after size1
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, size1));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + size1, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+	// Test various update counts
+ printf("\n various update count tests");
+ for (update_count = 1; update_count <= TEST_LEN; update_count++) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base,
+ murmur3_base);
+
+ // subsequent update
+ size1 = TEST_LEN / update_count;
+		size2 = TEST_LEN - size1 * (update_count - 1);	// size2 is the remainder after (update_count - 1) chunks of size1
+
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ for (i = 1, offset = 0; i < update_count; i++) {
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size1));
+ offset += size1;
+ }
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff + offset, size2));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail size1=%d\n", size1);
+ return -1;
+ }
+
+ if ((size2 & 0xff) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+	// Test various start addresses of ctx.
+ printf("\n various start address of ctx test");
+ free(update_ctx);
+ mem_addr = (uint8_t *) malloc(sizeof(*update_ctx) + AVX512_ALIGNED * 10);
+ for (addr_offset = AVX512_ALIGNED * 10; addr_offset >= 0; addr_offset--) {
+
+ // Fill with rand data
+ rand_buffer(buff, TEST_LEN);
+
+ mh_sha1_murmur3_x64_128_base(buff, TEST_LEN, TEST_SEED, hash_base,
+ murmur3_base);
+
+		// an unaligned offset
+ update_ctx = (struct mh_sha1_murmur3_x64_128_ctx *)(mem_addr + addr_offset);
+ CHECK_RETURN(mh_sha1_murmur3_x64_128_init(update_ctx, TEST_SEED));
+ CHECK_RETURN(TEST_UPDATE_FUNCTION(update_ctx, buff, TEST_LEN));
+ CHECK_RETURN(TEST_FINAL_FUNCTION(update_ctx, hash_test, murmur3_test));
+
+ fail = compare_digests(hash_base, hash_test, murmur3_base, murmur3_test);
+
+ if (fail) {
+ printf("Fail addr_offset=%d\n", addr_offset);
+ return -1;
+ }
+
+ if ((addr_offset & 0xf) == 0) {
+ putchar('.');
+ fflush(0);
+ }
+ }
+
+ printf("\n" xstr(TEST_UPDATE_FUNCTION) "_test: %s\n", fail == 0 ? "Pass" : "Fail");
+
+ return fail;
+
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c
new file mode 100644
index 000000000..75c3d90b5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128.c
@@ -0,0 +1,78 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdlib.h> // for NULL
+#include "murmur3_x64_128_internal.c"
+/*******************************************************************
+ * Single API which can calculate murmur3
+ ******************************************************************/
+/**
+ * @brief Get the digest of murmur3_x64_128 through a single API.
+ *
+ * Uses murmur3_x64_128_block and murmur3_x64_128_tail internally.
+ * Used to test the murmur3_x64_128 digest.
+ *
+ * @param buffer Pointer to buffer to be processed
+ * @param len Length of buffer (in bytes) to be processed
+ * @param murmur_seed Seed as an initial digest of murmur3
+ * @param murmur3_x64_128_digest The digest of murmur3_x64_128
+ * @returns none
+ *
+ */
+void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
+ uint32_t * murmur3_x64_128_digest)
+{
+ uint64_t *murmur3_x64_128_hash;
+ uint32_t murmur3_x64_128_hash_dword[4];
+ uint8_t *tail_buffer;
+ const uint8_t *input_data = (const uint8_t *)buffer;
+
+	// Initialize the murmur3 state from the seed
+ murmur3_x64_128_hash = (uint64_t *) murmur3_x64_128_hash_dword;
+ murmur3_x64_128_hash[0] = murmur_seed;
+ murmur3_x64_128_hash[1] = murmur_seed;
+
+	// Process the whole MUR_BLOCK_SIZE-byte blocks
+ murmur3_x64_128_block((uint8_t *) input_data, len / MUR_BLOCK_SIZE,
+ murmur3_x64_128_hash_dword);
+
+	// Process the tail bytes and finalize
+ tail_buffer = (uint8_t *) input_data + len - len % MUR_BLOCK_SIZE;
+ murmur3_x64_128_tail(tail_buffer, len, murmur3_x64_128_hash_dword);
+
+ // output the digests
+ if (murmur3_x64_128_digest != NULL) {
+ murmur3_x64_128_digest[0] = murmur3_x64_128_hash_dword[0];
+ murmur3_x64_128_digest[1] = murmur3_x64_128_hash_dword[1];
+ murmur3_x64_128_digest[2] = murmur3_x64_128_hash_dword[2];
+ murmur3_x64_128_digest[3] = murmur3_x64_128_hash_dword[3];
+ }
+
+ return;
+}
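
Editor's note: as a quick illustration of the single-shot API above, the following hedged sketch (an editor's example, not part of the patch) computes a 128-bit digest over a short message; it assumes only the murmur3_x64_128() prototype defined in this file.

#include <stdint.h>
#include <stdio.h>

/* Prototype as defined above. */
void murmur3_x64_128(const void *buffer, uint32_t len, uint64_t murmur_seed,
		     uint32_t *murmur3_x64_128_digest);

int main(void)
{
	const char msg[] = "hello, murmur3";
	uint32_t digest[4];	/* MURMUR3_x64_128_DIGEST_WORDS words */

	/* Hash the message bytes (excluding the trailing NUL) with a fixed seed. */
	murmur3_x64_128(msg, (uint32_t)(sizeof(msg) - 1), 0x1234, digest);

	printf("%08x%08x%08x%08x\n", digest[0], digest[1], digest[2], digest[3]);
	return 0;
}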
diff --git a/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c
new file mode 100644
index 000000000..6aab002ef
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/mh_sha1_murmur3_x64_128/murmur3_x64_128_internal.c
@@ -0,0 +1,138 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "mh_sha1_murmur3_x64_128_internal.h"
+#include <stdlib.h> // for NULL
+
+/* murmur3_x64_128 constants */
+// Rotation amounts (in bits) for the circular rotates
+#define MUR_SH1 31
+#define MUR_SH2 33
+#define MUR_SH3 27
+#define MUR_SH4 31
+#define MUR_SH5 33
+
+#define MUR_MUL 5
+#define MUR_ADD1 0x52dce729
+#define MUR_ADD2 0x38495ab5
+
+#define MUR_CON1 0x87c37b91114253d5LLU
+#define MUR_CON2 0x4cf5ad432745937fLLU
+
+#define MUR_FMUL1 0xff51afd7ed558ccdLLU
+#define MUR_FMUL2 0xc4ceb9fe1a85ec53LLU
+
+/* murmur3_x64_128 inline functions */
+static inline uint64_t blockmix64(uint64_t data, uint64_t conA, uint64_t conB, uint64_t shift)
+{
+ data *= conA;
+ data = (data << shift) | (data >> (64 - shift));
+ data *= conB;
+ return data;
+}
+
+static inline uint64_t hashmix64(uint64_t hashA, uint64_t hashB, uint64_t data, uint64_t add,
+ uint64_t shift)
+{
+ hashA ^= data;
+ hashA = (hashA << shift) | (hashA >> (64 - shift));
+ hashA += hashB;
+ hashA = hashA * MUR_MUL + add;
+ return hashA;
+}
+
+void murmur3_x64_128_block(const uint8_t * input_data, uint32_t num_blocks,
+ uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS])
+{
+ uint64_t data1, data2;
+ uint64_t *input_qword = (uint64_t *) input_data;
+ uint64_t *hash = (uint64_t *) digests;
+ uint32_t i = 0;
+
+ while (i < num_blocks) {
+ data1 = input_qword[i * 2];
+ data2 = input_qword[i * 2 + 1];
+ data1 = blockmix64(data1, MUR_CON1, MUR_CON2, MUR_SH1);
+ data2 = blockmix64(data2, MUR_CON2, MUR_CON1, MUR_SH2);
+ hash[0] = hashmix64(hash[0], hash[1], data1, MUR_ADD1, MUR_SH3);
+ hash[1] = hashmix64(hash[1], hash[0], data2, MUR_ADD2, MUR_SH4);
+ i++;
+ }
+
+ return;
+}
+
+void murmur3_x64_128_tail(const uint8_t * tail_buffer, uint32_t total_len,
+ uint32_t digests[MURMUR3_x64_128_DIGEST_WORDS])
+{
+ uint64_t data1, data2;
+ uint64_t *hash = (uint64_t *) digests;
+ uint64_t tail_len = total_len % 16;
+ uint8_t *tail = (uint8_t *) tail_buffer;
+
+ union {
+ uint64_t hash[2];
+ uint8_t hashB[16];
+ } hashU;
+
+ // tail
+ hashU.hash[0] = hashU.hash[1] = 0;
+
+ while (tail_len-- > 0)
+ hashU.hashB[tail_len] = tail[tail_len];
+
+ data1 = hashU.hash[0];
+ data2 = hashU.hash[1];
+
+ data1 = blockmix64(data1, MUR_CON1, MUR_CON2, MUR_SH1);
+ data2 = blockmix64(data2, MUR_CON2, MUR_CON1, MUR_SH2);
+
+ hash[0] ^= total_len ^ data1;
+ hash[1] ^= total_len ^ data2;
+
+ hash[0] += hash[1];
+ hash[1] += hash[0];
+
+ hash[0] ^= hash[0] >> MUR_SH5;
+ hash[0] *= MUR_FMUL1;
+ hash[0] ^= hash[0] >> MUR_SH5;
+ hash[0] *= MUR_FMUL2;
+ hash[0] ^= hash[0] >> MUR_SH5;
+
+ hash[1] ^= hash[1] >> MUR_SH5;
+ hash[1] *= MUR_FMUL1;
+ hash[1] ^= hash[1] >> MUR_SH5;
+ hash[1] *= MUR_FMUL2;
+ hash[1] ^= hash[1] >> MUR_SH5;
+
+ hash[0] += hash[1];
+ hash[1] += hash[0];
+
+ return;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am
new file mode 100644
index 000000000..cefc3173f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/Makefile.am
@@ -0,0 +1,88 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc += sha1_mb/sha1_ctx_sse.c \
+ sha1_mb/sha1_ctx_avx.c \
+ sha1_mb/sha1_ctx_avx2.c
+
+lsrc += sha1_mb/sha1_mb_mgr_init_sse.c \
+ sha1_mb/sha1_mb_mgr_init_avx2.c
+
+lsrc += sha1_mb/sha1_mb_mgr_submit_sse.asm \
+ sha1_mb/sha1_mb_mgr_submit_avx.asm \
+ sha1_mb/sha1_mb_mgr_submit_avx2.asm \
+ sha1_mb/sha1_mb_mgr_flush_sse.asm \
+ sha1_mb/sha1_mb_mgr_flush_avx.asm \
+ sha1_mb/sha1_mb_mgr_flush_avx2.asm \
+ sha1_mb/sha1_mb_x4_sse.asm \
+ sha1_mb/sha1_mb_x4_avx.asm \
+ sha1_mb/sha1_mb_x8_avx2.asm \
+ sha1_mb/sha1_multibinary.asm
+
+lsrc += sha1_mb/sha1_ctx_avx512.c \
+ sha1_mb/sha1_mb_mgr_init_avx512.c \
+ sha1_mb/sha1_mb_mgr_submit_avx512.asm \
+ sha1_mb/sha1_mb_mgr_flush_avx512.asm \
+ sha1_mb/sha1_mb_x16_avx512.asm
+
+extern_hdrs += include/sha1_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ include/multibinary.asm \
+ sha1_mb/sha1_job.asm \
+ sha1_mb/sha1_mb_mgr_datastruct.asm \
+ include/reg_sizes.asm \
+ sha1_mb/sha1_ref.c \
+ include/memcpy_inline.h \
+ include/memcpy.asm \
+ include/intrinreg.h
+
+check_tests += sha1_mb/sha1_mb_test \
+ sha1_mb/sha1_mb_rand_test \
+ sha1_mb/sha1_mb_rand_update_test
+
+unit_tests += sha1_mb/sha1_mb_rand_ssl_test
+
+perf_tests += sha1_mb/sha1_mb_vs_ossl_perf
+
+examples += sha1_mb/sha1_multi_buffer_example
+
+
+sha1_mb_rand_test: sha1_ref.o
+sha1_mb_sha1_mb_rand_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la
+
+sha1_mb_rand_update_test: sha1_ref.o
+sha1_mb_sha1_mb_rand_update_test_LDADD = sha1_mb/sha1_ref.lo libisal_crypto.la
+
+sha1_mb_rand_ssl_test: LDLIBS += -lcrypto
+sha1_mb_sha1_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sha1_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sha1_mb_sha1_mb_vs_ossl_perf_LDFLAGS = -lcrypto
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c
new file mode 100644
index 000000000..95da330bf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx.c
@@ -0,0 +1,250 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_avx(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_avx(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_fixedlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx_slver_02020142;
+struct slver sha1_ctx_mgr_init_avx_slver = { 0x0142, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_submit_avx_slver_02020143;
+struct slver sha1_ctx_mgr_submit_avx_slver = { 0x0143, 0x02, 0x02 };
+
+struct slver sha1_ctx_mgr_flush_avx_slver_02020144;
+struct slver sha1_ctx_mgr_flush_avx_slver = { 0x0144, 0x02, 0x02 };
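
Editor's note: the submit/flush pair above follows the usual multi-buffer pattern: jobs are submitted with FIRST/UPDATE/LAST (or ENTIRE) flags and drained with flush. A hedged usage sketch follows; hash_ctx_init() and the plain static allocation are assumptions based on the library's multi-buffer conventions, not guarantees of this patch.

#include <stdio.h>
#include "sha1_mb.h"

int main(void)
{
	/* Real code should honour the manager's alignment requirements. */
	static SHA1_HASH_CTX_MGR mgr;
	static SHA1_HASH_CTX ctx;
	static unsigned char data[4096];
	SHA1_HASH_CTX *job;
	int i;

	sha1_ctx_mgr_init_avx(&mgr);
	hash_ctx_init(&ctx);	/* assumed helper from multi_buffer.h */

	/* Whole buffer in one call: FIRST and LAST combined (HASH_ENTIRE). */
	sha1_ctx_mgr_submit_avx(&mgr, &ctx, data, sizeof(data), HASH_ENTIRE);

	/* Drain jobs still in flight across all lanes. */
	while ((job = sha1_ctx_mgr_flush_avx(&mgr)) != NULL)
		;

	for (i = 0; i < SHA1_DIGEST_NWORDS; i++)
		printf("%08x", ctx.job.result_digest[i]);
	printf("\n");
	return 0;
}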
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c
new file mode 100644
index 000000000..8ae26e30b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx2.c
@@ -0,0 +1,249 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_avx2(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx2(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx2(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx2(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx2_slver_04020145;
+struct slver sha1_ctx_mgr_init_avx2_slver = { 0x0145, 0x02, 0x04 };
+
+struct slver sha1_ctx_mgr_submit_avx2_slver_04020146;
+struct slver sha1_ctx_mgr_submit_avx2_slver = { 0x0146, 0x02, 0x04 };
+
+struct slver sha1_ctx_mgr_flush_avx2_slver_04020147;
+struct slver sha1_ctx_mgr_flush_avx2_slver = { 0x0147, 0x02, 0x04 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c
new file mode 100644
index 000000000..a47f422b2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_avx512.c
@@ -0,0 +1,256 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_avx512(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_avx512(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_avx512(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_fixedlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx =
+ (SHA1_HASH_CTX *) sha1_mb_mgr_submit_avx512(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_avx512_slver_0600014a;
+struct slver sha1_ctx_mgr_init_avx512_slver = { 0x014a, 0x00, 0x06 };
+
+struct slver sha1_ctx_mgr_submit_avx512_slver_0600014b;
+struct slver sha1_ctx_mgr_submit_avx512_slver = { 0x014b, 0x00, 0x06 };
+
+struct slver sha1_ctx_mgr_flush_avx512_slver_0600014c;
+struct slver sha1_ctx_mgr_flush_avx512_slver = { 0x014c, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c
new file mode 100644
index 000000000..a1c6e6443
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ctx_sse.c
@@ -0,0 +1,250 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA1_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx);
+
+void sha1_ctx_mgr_init_sse(SHA1_HASH_CTX_MGR * mgr)
+{
+ sha1_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_submit_sse(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx,
+ const void *buffer, uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA1_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA1_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA1_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA1_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha1_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA1_HASH_CTX *sha1_ctx_mgr_flush_sse(SHA1_HASH_CTX_MGR * mgr)
+{
+ SHA1_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha1_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha1_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA1_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA1_HASH_CTX *sha1_ctx_mgr_resubmit(SHA1_HASH_CTX_MGR * mgr, SHA1_HASH_CTX * ctx)
+{
+ while (ctx) {
+
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA1_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA1_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA1_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA1_HASH_CTX *) sha1_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA1_WORD_T * digest)
+{
+ static const SHA1_WORD_T hash_initial_digest[SHA1_DIGEST_NWORDS] =
+ { SHA1_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA1_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA1_BLOCK_SIZE - 1) & (0 - (total_len + SHA1_PADLENGTHFIELD_SIZE + 1))) + 1 +
+ SHA1_PADLENGTHFIELD_SIZE;
+
+#if SHA1_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA1_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha1_ctx_mgr_init_sse_slver_00020139;
+struct slver sha1_ctx_mgr_init_sse_slver = { 0x0139, 0x02, 0x00 };
+
+struct slver sha1_ctx_mgr_submit_sse_slver_00020140;
+struct slver sha1_ctx_mgr_submit_sse_slver = { 0x0140, 0x02, 0x00 };
+
+struct slver sha1_ctx_mgr_flush_sse_slver_00020141;
+struct slver sha1_ctx_mgr_flush_sse_slver = { 0x0141, 0x02, 0x00 };
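
Editorial note: the hash_pad() helper above writes the SHA-1 trailer (a 0x80 byte, zero fill, and the big-endian bit length) into the two extra blocks and returns how many of them still need to be hashed. The branchless index arithmetic is easy to misread, so the sketch below cross-checks it against the obvious formulation, assuming a 64-byte block and an 8-byte length field (the #if above shows the field can also be 16 bytes). This is an illustration only, not part of ISA-L.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Sketch of the pad-length arithmetic in hash_pad(), assuming
     * SHA1_BLOCK_SIZE == 64 and SHA1_PADLENGTHFIELD_SIZE == 8. */
    static uint32_t pad_blocks_branchless(uint32_t total_len)
    {
            uint32_t i = total_len & 63;                      /* bytes already in the extra block */
            i += ((63 & (0 - (total_len + 8 + 1))) + 1 + 8);  /* 0x80 byte + zero fill + length field */
            return i >> 6;                                    /* number of extra blocks (1 or 2) */
    }

    static uint32_t pad_blocks_reference(uint32_t total_len)
    {
            /* one extra block unless fewer than 9 bytes remain for 0x80 + the length */
            return ((total_len & 63) < 56) ? 1 : 2;
    }

    int main(void)
    {
            for (uint32_t len = 0; len < 1024; len++)
                    assert(pad_blocks_branchless(len) == pad_blocks_reference(len));
            printf("pad block counts match\n");
            return 0;
    }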
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm
new file mode 100644
index 000000000..324ff0f54
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_job.asm
@@ -0,0 +1,55 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA1_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA1_JOB
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD	_len,	4,	4	; length in blocks
+FIELD _result_digest, 5*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+END_FIELDS
+
+%assign _SHA1_JOB_size _FIELD_OFFSET
+%assign _SHA1_JOB_align _STRUCT_ALIGN
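
Editorial note: the FIELD/END_FIELDS macros above only compute byte offsets for use from assembly; the real C object is declared in sha1_mb.h, which is not part of this hunk. A rough C mirror of the layout is shown below; the field names, the status type, and the GCC-style alignment attribute are assumptions.

    #include <stdint.h>

    /* Approximate C view of the SHA1_JOB layout declared above (sketch only). */
    typedef struct {
            uint8_t  *buffer;                                         /* _buffer: data to hash */
            uint32_t  len;                                            /* _len: remaining length in blocks */
            uint32_t  result_digest[5] __attribute__((aligned(64)));  /* _result_digest */
            uint32_t  status;                                         /* _status: STS_* value */
            void     *user_data;                                      /* _user_data */
    } sha1_job_sketch;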
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..7fa566bff
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_datastruct.asm
@@ -0,0 +1,74 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA1 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA1_ARGS_X16
+;;; name size align
+FIELD _digest, 4*5*16, 16 ; transposed digest
+FIELD _data_ptr, 8*16, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _SHA1_ARGS_X4_size _FIELD_OFFSET
+%assign _SHA1_ARGS_X4_align _STRUCT_ALIGN
+%assign _SHA1_ARGS_X8_size _FIELD_OFFSET
+%assign _SHA1_ARGS_X8_align _STRUCT_ALIGN
+%assign _SHA1_ARGS_X16_size _FIELD_OFFSET
+%assign _SHA1_ARGS_X16_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _SHA1_ARGS_X4_size, _SHA1_ARGS_X4_align
+FIELD _lens, 4*16, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*16, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
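
Editorial note: the _digest area reserves room for 16 lanes on every architecture and is stored word-major ("transposed"): all lanes' word 0, then all lanes' word 1, and so on, with the stride equal to the lane count actually used by a given manager (4 for SSE/AVX, 8 for AVX2, 16 for AVX512). That is why the submit and flush routines below address it as 4*lane + word*4*NLANES. A small indexing sketch, with illustrative names:

    #include <stdint.h>
    #include <stdio.h>

    /* Word-major ("transposed") digest area: returns the address of one
     * lane's digest word for a manager that uses 'nlanes' lanes. */
    static uint32_t *digest_word(uint32_t *args_digest, int nlanes, int lane, int word)
    {
            return &args_digest[word * nlanes + lane];
    }

    int main(void)
    {
            uint32_t args_digest[5 * 16] = { 0 };               /* 16-lane area as declared above */
            *digest_word(args_digest, 8, 3, 2) = 0xdeadbeefu;   /* AVX2 manager: lane 3, word 2 */
            printf("%08x\n", *digest_word(args_digest, 8, 3, 2));
            return 0;
    }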
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..85e050d1c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx.asm
@@ -0,0 +1,224 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_avx
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be in a register not clobbered by sha1_mb_x4_avx
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be in a register not clobbered by sha1_mb_x4_avx
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*2
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha1_mb_mgr_flush_avx:function
+sha1_mb_mgr_flush_avx:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
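+	; if bit (16+3) is set, then all lanes are empty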
+ bt unused_lanes, 16+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
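
Editorial note: the flush routine above has to make progress even when some lanes are idle. It picks any lane that still holds a job, points the empty lanes at that lane's data with an effectively infinite length (0xFFFFFFFF), runs the 4-wide core for the minimum remaining block count, and retires the job whose count reached zero. Below is a scalar sketch of that scheduling; the structure names and the core are hypothetical stand-ins, and the digest copy-out and unused_lanes bookkeeping are omitted.

    #include <stdint.h>
    #include <stddef.h>

    #define NLANES    4
    #define EMPTY_LEN 0xFFFFFFFFu

    struct job { const uint8_t *buffer; uint32_t len; int status; };
    struct mgr {
            const uint8_t *data_ptr[NLANES];
            uint32_t       lens[NLANES];        /* remaining blocks per lane */
            struct job    *job_in_lane[NLANES];
    };

    /* assumed: hashes 'blocks' 64-byte blocks from every lane in parallel */
    extern void sha1_x4_core(struct mgr *m, uint32_t blocks);

    struct job *flush_sketch(struct mgr *m)
    {
            int idx = -1;
            for (int i = 0; i < NLANES; i++)            /* any occupied lane? */
                    if (m->job_in_lane[i]) { idx = i; break; }
            if (idx < 0)
                    return NULL;                        /* nothing in flight */

            for (int i = 0; i < NLANES; i++)            /* park empty lanes on valid data */
                    if (!m->job_in_lane[i]) {
                            m->data_ptr[i] = m->data_ptr[idx];
                            m->lens[i]     = EMPTY_LEN;
                    }

            for (int i = 0; i < NLANES; i++)            /* lane with the fewest blocks left */
                    if (m->lens[i] < m->lens[idx])
                            idx = i;

            uint32_t min_blocks = m->lens[idx];
            if (min_blocks) {
                    for (int i = 0; i < NLANES; i++)
                            m->lens[i] -= min_blocks;
                    sha1_x4_core(m, min_blocks);        /* lane 'idx' is now finished */
            }

            struct job *done = m->job_in_lane[idx];     /* retire the completed job */
            m->job_in_lane[idx] = NULL;
            return done;
    }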
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..8615bb233
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx2.asm
@@ -0,0 +1,251 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x8_avx2
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha1_mb_x8_avx2
+%define idx r8
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx2(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha1_mb_mgr_flush_avx2:function
+sha1_mb_mgr_flush_avx2:
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; if bit (32+3) is set, then all lanes are empty
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x8_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*32]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
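
Editorial note: the vpminud/vpalignr tree above reduces the eight packed lane lengths to a single minimum. Because submit stores each length as (blocks << 4) | lane, the unsigned minimum simultaneously selects the smallest block count and the lane that owns it; the low nibble is then masked off as idx and the high bits shifted down as the block count. A scalar sketch of the same idea:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t lens[8];
            uint32_t blocks[8] = { 7, 3, 9, 3, 12, 5, 8, 6 };

            for (uint32_t lane = 0; lane < 8; lane++)
                    lens[lane] = (blocks[lane] << 4) | lane;   /* pack block count and lane */

            uint32_t min = lens[0];
            for (int i = 1; i < 8; i++)
                    if (lens[i] < min)
                            min = lens[i];

            /* ties resolve to the lowest lane; here: lane 1, 3 blocks */
            printf("min lane = %u, min blocks = %u\n", min & 0xF, min >> 4);
            return 0;
    }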
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..d86a1e817
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_avx512.asm
@@ -0,0 +1,250 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha1_mb_x16_avx512
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%else
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions and their state across this function (unused, covered, unchanged)
+%define state arg1 ; unchanged
+%define job arg2 ; unused
+%define len2 arg2 ; unused
+
+; idx must be a register not clobbered by sha1_mb_x16_avx512
+%define idx r8 ; unchanged
+
+%define unused_lanes rbx ; covered
+%define lane_data rbx ; covered
+%define tmp2 rbx ; covered
+
+%define num_lanes_inuse r9 ; covered
+
+%define job_rax rax ; covered
+%define tmp rax ; unused
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_avx512(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha1_mb_mgr_flush_avx512:function
+sha1_mb_mgr_flush_avx512:
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*64]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_mb_mgr_flush_avx512
+no_sha1_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
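
Editorial note: the 4- and 8-lane managers keep their free lanes as a packed stack of 4-bit lane indices in _unused_lanes (pop: take the low nibble and shift right by 4; push: shift left by 4 and OR the lane index in), with a 0xF sentinel nibble marking the bottom of the stack. The 16-lane AVX512 manager has no spare nibble value for a sentinel, so it tracks occupancy in the separate _num_lanes_inuse counter tested at the top of the routine above. A sketch of the stack operations for the 4-lane case:

    #include <stdint.h>
    #include <stdio.h>

    /* Nibble-stack sketch for the 4-lane (SSE/AVX) manager: unused_lanes
     * starts as 0xF3210 (lanes 0..3 free, sentinel 0xF at the bottom). */
    static int pop_lane(uint64_t *unused_lanes)
    {
            int lane = (int)(*unused_lanes & 0xF);    /* next free lane */
            *unused_lanes >>= 4;
            return lane;
    }

    static void push_lane(uint64_t *unused_lanes, int lane)
    {
            *unused_lanes = (*unused_lanes << 4) | (uint64_t)(unsigned)lane;
    }

    int main(void)
    {
            uint64_t unused_lanes = 0xF3210;

            int a = pop_lane(&unused_lanes);          /* 0 */
            int b = pop_lane(&unused_lanes);          /* 1 */
            printf("popped %d, %d -> stack %llx\n", a, b,
                   (unsigned long long)unused_lanes); /* F32 */

            push_lane(&unused_lanes, b);              /* release lane 1: back to F321 */
            printf("stack %llx\n", (unsigned long long)unused_lanes);
            return 0;
    }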
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..6796425b9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_flush_sse.asm
@@ -0,0 +1,225 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_sse
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be a register other than ARG1, ARG2, rax, r8-r11
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be a register other than ARG1, ARG2, rax, r8-r11
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*2
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA1_JOB* sha1_mb_mgr_flush_sse(SHA1_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha1_mb_mgr_flush_sse:function
+sha1_mb_mgr_flush_sse:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
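+	; if bit (16+3) is set, then all lanes are empty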
+ bt unused_lanes, 16+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp2), [state + _args_digest + 4*idx + 4*16]
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp2)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..69360d00f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx2.c
@@ -0,0 +1,40 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+
+void sha1_mb_mgr_init_avx2(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF76543210;
+ for (j = 0; j < SHA1_X8_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..7c8849b47
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_avx512.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+
+void sha1_mb_mgr_init_avx512(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xfedcba9876543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA1_MAX_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c
new file mode 100644
index 000000000..f8985b93a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_init_sse.c
@@ -0,0 +1,40 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha1_mb.h"
+
+void sha1_mb_mgr_init_sse(SHA1_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF3210;
+ for (j = 0; j < SHA1_MIN_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
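
Editorial note: the three init routines above differ only in how many lanes they expose: 0xF3210 for SSE/AVX (4 lanes plus the 0xF sentinel), 0xF76543210 for AVX2 (8 lanes plus the sentinel), and the full 0xfedcba9876543210 for AVX512, which relies on num_lanes_inuse instead of a sentinel (with 16 lanes, indices 8..15 themselves set the top bit of a nibble, so a sentinel-bit test would presumably be ambiguous). The sentinel is what the flush routines' bt tests check: bit 4*NLANES+3 is set only while no lane has been popped. A small check of that invariant:

    #include <assert.h>
    #include <stdint.h>

    /* Why 'bt unused_lanes, 16+3' (SSE/AVX) and 'bt unused_lanes, 32+3' (AVX2)
     * mean "all lanes are free": the sentinel nibble 0xF sits at nibble
     * position NLANES only when no lane has been handed out.  Sketch only. */
    static int all_free(uint64_t unused_lanes, int nlanes)
    {
            return (int)((unused_lanes >> (4 * nlanes + 3)) & 1);
    }

    int main(void)
    {
            assert(all_free(0xF3210, 4));           /* freshly initialised SSE manager */
            assert(!all_free(0xF3210 >> 4, 4));     /* one lane handed out */
            assert(all_free(0xF76543210, 8));       /* freshly initialised AVX2 manager */
            return 0;
    }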
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..b18a4369d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx.asm
@@ -0,0 +1,238 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_avx
+
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be in a register not clobbered by sha1_mb_x4_avx
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%else
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+; idx needs to be in a register not clobbered by sha1_mb_x4_avx
+%define last_len rdx
+%define idx rdx
+
+%define size_offset rcx
+%define tmp2 rcx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*4 + 16*10 + 8
+
+; SHA1_JOB* sha1_mb_mgr_submit_avx(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global sha1_mb_mgr_submit_avx:function
+sha1_mb_mgr_submit_avx:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*4 + 16*0], xmm6
+ vmovdqa [rsp + 8*4 + 16*1], xmm7
+ vmovdqa [rsp + 8*4 + 16*2], xmm8
+ vmovdqa [rsp + 8*4 + 16*3], xmm9
+ vmovdqa [rsp + 8*4 + 16*4], xmm10
+ vmovdqa [rsp + 8*4 + 16*5], xmm11
+ vmovdqa [rsp + 8*4 + 16*6], xmm12
+ vmovdqa [rsp + 8*4 + 16*7], xmm13
+ vmovdqa [rsp + 8*4 + 16*8], xmm14
+ vmovdqa [rsp + 8*4 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*16], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*16], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*16]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*4 + 16*0]
+ vmovdqa xmm7, [rsp + 8*4 + 16*1]
+ vmovdqa xmm8, [rsp + 8*4 + 16*2]
+ vmovdqa xmm9, [rsp + 8*4 + 16*3]
+ vmovdqa xmm10, [rsp + 8*4 + 16*4]
+ vmovdqa xmm11, [rsp + 8*4 + 16*5]
+ vmovdqa xmm12, [rsp + 8*4 + 16*6]
+ vmovdqa xmm13, [rsp + 8*4 + 16*7]
+ vmovdqa xmm14, [rsp + 8*4 + 16*8]
+ vmovdqa xmm15, [rsp + 8*4 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+H0: dd 0x67452301
+H1: dd 0xefcdab89
+H2: dd 0x98badcfe
+H3: dd 0x10325476
+H4: dd 0xc3d2e1f0
+
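
Editorial note: the submit path above pops a free lane off the nibble stack, records the job, scatters its result_digest words into the transposed args area, and stores (len << 4) | lane into _lens. Hashing is deferred until the stack is down to the bare 0xF sentinel, meaning all four lanes are occupied; until then the routine returns NULL. A scalar sketch of that decision follows; the structure names and the helper that runs the core until one lane finishes are hypothetical, and the digest scatter is only indicated by a comment.

    #include <stdint.h>
    #include <stddef.h>

    #define NLANES 4

    struct job { const uint8_t *buffer; uint32_t len /* blocks */; uint32_t digest[5]; };
    struct mgr {
            uint64_t    unused_lanes;           /* nibble stack, 0xF3210 when all free */
            uint32_t    lens[NLANES];           /* (blocks << 4) | lane */
            struct job *job_in_lane[NLANES];
    };

    /* assumed: the start_loop path, hashes until one lane's count hits zero */
    extern struct job *run_until_one_finishes(struct mgr *m);

    struct job *submit_sketch(struct mgr *m, struct job *j)
    {
            int lane = (int)(m->unused_lanes & 0xF);          /* pop a free lane */
            m->unused_lanes >>= 4;

            m->job_in_lane[lane] = j;
            m->lens[lane] = (j->len << 4) | (uint32_t)lane;   /* pack blocks and lane */
            /* ...digest words would be scattered into the transposed args area here... */

            if (m->unused_lanes != 0xF)                       /* still a free lane left */
                    return NULL;                              /* defer hashing */

            return run_until_one_finishes(m);                 /* all 4 lanes full: make progress */
    }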
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..f21ecf5f0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx2.asm
@@ -0,0 +1,242 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "memcpy.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x8_avx2
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%define extra_blocks rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%define extra_blocks rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+; idx must be a register not clobbered by sha1_mb_x8_avx2
+%define idx r8
+%define last_len r8
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* sha1_mb_mgr_submit_avx2(MB_MGR *state, JOB_SHA1 *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global sha1_mb_mgr_submit_avx2:function
+sha1_mb_mgr_submit_avx2:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+
+ shl len,4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+
+ vmovd [state + _args_digest + 4*lane + 0*32], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*32], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*32], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*32], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*32], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ cmp unused_lanes, 0xf
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x8_avx2
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*32]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*32], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*32], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*32], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*32]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
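Context for the submit logic above: each entry of _lens packs a job's remaining length in blocks together with its lane number, length in the upper bits and lane in the low nibble, so a single unsigned-minimum reduction yields both the shortest job and the lane that holds it. A minimal C sketch of that encoding and reduction, with illustrative names (NUM_LANES, encode_len, find_min_len are not part of ISA-L):

    #include <stdint.h>

    #define NUM_LANES 8   /* the AVX2 manager runs 8 lanes; sketch only */

    /* Mirror of the (blocks << 4) | lane packing written to _lens. */
    static uint32_t encode_len(uint32_t blocks, uint32_t lane)
    {
            return (blocks << 4) | (lane & 0xF);
    }

    /* Returns the packed minimum; the caller splits it back apart:
     * lane = min & 0xF, blocks = min >> 4 (the asm's idx and len2). */
    static uint32_t find_min_len(const uint32_t lens[NUM_LANES])
    {
            uint32_t min = lens[0];
            for (int i = 1; i < NUM_LANES; i++)
                    if (lens[i] < min)
                            min = lens[i];
            return min;
    }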
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..30e8b25cf
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_avx512.asm
@@ -0,0 +1,243 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "memcpy.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha1_mb_x16_avx512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%else
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common definitions and their register state after the call (unused, covered, unchanged)
+%define state arg1 ; unchanged, mb_x16's input1
+%define job arg2 ; arg2 unused
+%define len2 arg2 ; arg2 unused, mb_x16's input2
+
+; idx must be a register not clobbered by sha1_mb_x16_avx512
+%define idx r8 ; unchanged
+
+%define p r11 ; unused
+
+%define unused_lanes rbx ; covered
+
+%define job_rax rax ; covered
+%define len rax ; unused
+
+%define lane rbp ; unused
+
+%define tmp r9 ; covered
+%define num_lanes_inuse r9 ; covered
+
+%define lane_data r10 ; covered
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; JOB* sha1_mb_mgr_submit_avx512(MB_MGR *state, JOB_SHA1 *job)
+; arg 1 : rcx (win64) / rdi (elf64) : state
+; arg 2 : rdx (win64) / rsi (elf64) : job
+global sha1_mb_mgr_submit_avx512:function
+sha1_mb_mgr_submit_avx512:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+
+ shl len,4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+
+ vmovd [state + _args_digest + 4*lane + 0*64], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*64], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*64], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*64], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*64], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 16
+ jne return_null
+
+start_loop:
+ ; Find min length; ymm0 holds lengths for the first 8 lanes, ymm1 for the last 8
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,x, x, x,x,x, x,C3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF ; idx is the lane index of the min-length job
+ shr len2, 4 ; size in blocks
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x16_avx512
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*64]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*64], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*64], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*64], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*64]
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=32
+
+align 32
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_mb_mgr_submit_avx512
+no_sha1_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
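One difference from the AVX2 manager worth noting: this AVX-512 variant keeps an explicit _num_lanes_inuse counter and only launches sha1_mb_x16_avx512 once all 16 lanes are occupied; otherwise submit returns NULL and the partially filled batch is left for the flush path. A rough C model of that gating, using a hypothetical struct in place of the real layout from sha1_mb_mgr_datastruct.asm:

    #include <stddef.h>
    #include <stdint.h>

    struct mgr_sketch {
            uint32_t num_lanes_inuse;   /* stand-in for _num_lanes_inuse */
            /* lane/job state elided */
    };

    void *submit_sketch(struct mgr_sketch *mgr, void *job)
    {
            /* ...place job in a free lane, record its packed length... */
            mgr->num_lanes_inuse++;
            if (mgr->num_lanes_inuse < 16)
                    return NULL;    /* not full yet; the job comes back via flush */
            /* all 16 lanes busy: run the x16 core for the min block count,
             * then hand back the job that finished (elided). */
            return job;
    }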
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..e780006df
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_mgr_submit_sse.asm
@@ -0,0 +1,238 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_job.asm"
+%include "sha1_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha1_mb_x4_sse
+
+%ifidn __OUTPUT_FORMAT__, win64
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%else
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+
+; idx needs to be other than ARG2, rax, r8-r11
+%define last_len rdx
+%define idx rdx
+
+%define size_offset rcx
+%define tmp2 rcx
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*4 + 16*10 + 8
+
+; SHA1_JOB* sha1_mb_mgr_submit_sse(SHA1_MB_JOB_MGR *state, SHA1_JOB *job)
+; arg 1 : rcx (win64) / rdi (elf64) : state
+; arg 2 : rdx (win64) / rsi (elf64) : job
+global sha1_mb_mgr_submit_sse:function
+sha1_mb_mgr_submit_sse:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ movdqa [rsp + 8*4 + 16*0], xmm6
+ movdqa [rsp + 8*4 + 16*1], xmm7
+ movdqa [rsp + 8*4 + 16*2], xmm8
+ movdqa [rsp + 8*4 + 16*3], xmm9
+ movdqa [rsp + 8*4 + 16*4], xmm10
+ movdqa [rsp + 8*4 + 16*5], xmm11
+ movdqa [rsp + 8*4 + 16*6], xmm12
+ movdqa [rsp + 8*4 + 16*7], xmm13
+ movdqa [rsp + 8*4 + 16*8], xmm14
+ movdqa [rsp + 8*4 + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ mov DWORD(tmp), [job + _result_digest + 1*16]
+ movd [state + _args_digest + 4*lane + 0*16], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ mov [state + _args_digest + 4*lane + 4*16], DWORD(tmp)
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha1_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ mov DWORD(tmp), [state + _args_digest + 4*idx + 4*16]
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ mov [job_rax + _result_digest + 1*16], DWORD(tmp)
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 8*4 + 16*0]
+ movdqa xmm7, [rsp + 8*4 + 16*1]
+ movdqa xmm8, [rsp + 8*4 + 16*2]
+ movdqa xmm9, [rsp + 8*4 + 16*3]
+ movdqa xmm10, [rsp + 8*4 + 16*4]
+ movdqa xmm11, [rsp + 8*4 + 16*5]
+ movdqa xmm12, [rsp + 8*4 + 16*6]
+ movdqa xmm13, [rsp + 8*4 + 16*7]
+ movdqa xmm14, [rsp + 8*4 + 16*8]
+ movdqa xmm15, [rsp + 8*4 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+H0: dd 0x67452301
+H1: dd 0xefcdab89
+H2: dd 0x98badcfe
+H3: dd 0x10325476
+H4: dd 0xc3d2e1f0
+
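The SSE manager tracks only four lanes, so instead of a vector reduction it finds the minimum packed length with the compare/conditional-move chain above. An equivalent scalar sketch in C, names illustrative:

    #include <stdint.h>

    /* Minimum over the four packed lane lengths; because the lane number
     * lives in the low nibble, the winner also identifies its lane. */
    static uint32_t min4(const uint32_t lens[4])
    {
            uint32_t min = lens[0];
            if (lens[1] < min) min = lens[1];   /* cmp + cmovb */
            if (lens[2] < min) min = lens[2];
            if (lens[3] < min) min = lens[3];
            return min;    /* idx = min & 0xF, blocks = min >> 4 */
    }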
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c
new file mode 100644
index 000000000..f4319cd96
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_ssl_test.c
@@ -0,0 +1,157 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha1_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+unsigned int byteswap(unsigned int x)
+{
+ return (x >> 24) | (x >> 8 & 0xff00) | (x << 8 & 0xff0000) | (x << 24);
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+
+ printf("multibinary_sha1 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ srand(TEST_SEED);
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ SHA1(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ byteswap(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ byteswap(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ SHA1(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ byteswap(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ byteswap(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1_ssl rand: Pass\n");
+
+ return fail;
+}
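The byteswap() in this test exists because OpenSSL's SHA1() emits the digest as big-endian bytes, while the multi-buffer job stores result_digest as host-endian 32-bit words. A self-contained sketch of the comparison it performs (helper names here are illustrative, not from the test):

    #include <stdint.h>

    /* Host-endian view of four big-endian digest bytes. */
    static uint32_t be32_to_host(const uint8_t b[4])
    {
            return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16) |
                   ((uint32_t)b[2] << 8)  | (uint32_t)b[3];
    }

    /* mb_word comes from job.result_digest[j]; ssl points at SHA1()'s output. */
    static int digest_word_matches(uint32_t mb_word, const uint8_t *ssl, int j)
    {
            return mb_word == be32_to_host(ssl + 4 * j);
    }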
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c
new file mode 100644
index 000000000..f818b39eb
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_test.c
@@ -0,0 +1,196 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha1_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+
+ printf("multibinary_sha1 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS, TEST_LEN);
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ sha1_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha1_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+ // Run sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ sha1_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sha1_mb test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%08X <=> 0x%08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ sha1_ctx_mgr_init(mgr);
+
+ // Extend to the end of allocated buffer to construct jobs
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ sha1_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // sb_sha1 test
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%08X"
+ ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1 rand: Pass\n");
+
+ return fail;
+}
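The test above exercises the basic lifecycle of the context-manager API: allocate an aligned SHA1_HASH_CTX_MGR, init it, submit whole buffers with HASH_ENTIRE, and call flush until it returns NULL to drain the remaining lanes. A trimmed single-buffer sketch of that usage, assuming the same functions the test calls and with status checking elided:

    #include <stdlib.h>
    #include "sha1_mb.h"

    /* Hash one buffer with the multi-buffer manager; the digest lands in
     * ctx.job.result_digest. Sketch only: no error/status checks. */
    int sha1_one_buffer(void *buf, uint32_t len,
                        uint32_t digest[SHA1_DIGEST_NWORDS])
    {
            SHA1_HASH_CTX_MGR *mgr = NULL;
            SHA1_HASH_CTX ctx;
            int j;

            if (posix_memalign((void *)&mgr, 16, sizeof(*mgr)))
                    return -1;
            sha1_ctx_mgr_init(mgr);

            hash_ctx_init(&ctx);
            sha1_ctx_mgr_submit(mgr, &ctx, buf, len, HASH_ENTIRE);
            while (sha1_ctx_mgr_flush(mgr) != NULL)
                    ;                       /* drain in-flight lanes */

            for (j = 0; j < SHA1_DIGEST_NWORDS; j++)
                    digest[j] = ctx.job.result_digest[j];
            free(mgr);
            return 0;
    }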
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c
new file mode 100644
index 000000000..53358b810
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_rand_update_test.c
@@ -0,0 +1,291 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha1_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*SHA1_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA1_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint32_t digest_ref[TEST_BUFS][SHA1_DIGEST_NWORDS];
+
+extern void sha1_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+
+ printf("multibinary_sha1_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha1_ref(bufs[i], digest_ref[i], TEST_LEN);
+ }
+
+ // Run sb_sha1 tests
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ // Move on to the next job if this one was accepted or already finished
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha1_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha1_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i], buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha1_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d fail %8X <=> %8X",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ sha1_ref(bufs[i], digest_ref[i], lens[i]);
+ }
+
+ sha1_ctx_mgr_init(mgr);
+
+ // Run sha1_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = SHA1_BLOCK_SIZE +
+ SHA1_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % SHA1_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem, HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha1_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha1_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % SHA1_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha1_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1_update rand: Pass\n");
+
+ return fail;
+}
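The update test drives jobs through the manager in chunks; the central pattern is that submit and flush may hand back a context that is not yet complete, which the caller identifies via user_data and resubmits with HASH_UPDATE (or HASH_LAST for the final piece). A condensed sketch of one job streamed in fixed-size chunks, assuming the same flag names the test uses and waiting for each chunk before sending the next:

    #include <stdint.h>
    #include "sha1_mb.h"

    #define CHUNK (13 * SHA1_BLOCK_SIZE)   /* same size as the test's UPDATE_SIZE */

    /* Single job, so every submit/flush either returns our ctx or NULL. */
    void sha1_streamed(SHA1_HASH_CTX_MGR *mgr, SHA1_HASH_CTX *ctx,
                       unsigned char *buf, uint32_t len)
    {
            uint32_t done = 0;

            hash_ctx_init(ctx);
            while (done < len) {
                    uint32_t rem = len - done;
                    uint32_t n = (rem <= CHUNK) ? rem : CHUNK;
                    HASH_CTX_FLAG flag;

                    if (done == 0)
                            flag = (rem <= CHUNK) ? HASH_ENTIRE : HASH_FIRST;
                    else
                            flag = (rem <= CHUNK) ? HASH_LAST : HASH_UPDATE;

                    sha1_ctx_mgr_submit(mgr, ctx, buf + done, n, flag);
                    while (sha1_ctx_mgr_flush(mgr) != NULL)
                            ;       /* finish this chunk before sending the next */
                    done += n;
            }
    }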
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c
new file mode 100644
index 000000000..ee585367c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_test.c
@@ -0,0 +1,226 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha1_mb.h"
+
+typedef uint32_t DigestSHA1[SHA1_DIGEST_NWORDS];
+
+#define MSGS 7
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
+static uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
+static DigestSHA1 expResultDigest1 =
+ { 0x84983E44, 0x1C3BD26E, 0xBAAE4AA1, 0xF95129E5, 0xE54670F1 };
+
+static uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO";
+static DigestSHA1 expResultDigest2 =
+ { 0xB7C66452, 0x0FD122B3, 0x55D539F2, 0xA35E6FAA, 0xC2A5A11D };
+
+static uint8_t msg3[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<";
+static DigestSHA1 expResultDigest3 =
+ { 0x127729B6, 0xA8B2F8A0, 0xA4DDC819, 0x08E1D8B3, 0x67CEEA55 };
+
+static uint8_t msg4[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR";
+static DigestSHA1 expResultDigest4 =
+ { 0xFDDE2D00, 0xABD5B7A3, 0x699DE6F2, 0x3FF1D1AC, 0x3B872AC2 };
+
+static uint8_t msg5[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?";
+static DigestSHA1 expResultDigest5 =
+ { 0xE7FCA85C, 0xA4AB3740, 0x6A180B32, 0x0B8D362C, 0x622A96E6 };
+
+static uint8_t msg6[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU";
+static DigestSHA1 expResultDigest6 =
+ { 0x505B0686, 0xE1ACDF42, 0xB3588B5A, 0xB043D52C, 0x6D8C7444 };
+
+static uint8_t msg7[] = "";
+static DigestSHA1 expResultDigest7 =
+ { 0xDA39A3EE, 0x5E6B4B0D, 0x3255BFEF, 0x95601890, 0xAFD80709 };
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 };
+
+static uint32_t *expResultDigest[MSGS] = {
+ expResultDigest1, expResultDigest2, expResultDigest3,
+ expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7
+};
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ sha1_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i], msgs[i],
+ strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
+ while (1) {
+ ctx = sha1_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = sha1_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = sha1_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sha1 test: Pass\n");
+
+ return 0;
+}
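Because submit and flush can return whichever job finished, the tests stash the job's index in user_data at init time and read it back from the returned context to find the matching expected digest. A small sketch of that bookkeeping (function and parameter names are illustrative):

    #include <stdint.h>
    #include "sha1_mb.h"

    /* Check a returned context against the expected digest for its job,
     * using the index stored in user_data when the ctx pool was set up. */
    static int check_returned(SHA1_HASH_CTX *ctx, SHA1_HASH_CTX ctxpool[],
                              uint32_t *expected[])
    {
            unsigned long t = (unsigned long)ctx->user_data;
            for (int j = 0; j < SHA1_DIGEST_NWORDS; j++)
                    if (ctxpool[t].job.result_digest[j] != expected[t][j])
                            return -1;
            return 0;
    }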
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..558e82884
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_vs_ossl_perf.c
@@ -0,0 +1,129 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha1_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 10000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 100
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA1_DIGEST_NWORDS];
+
+inline unsigned int byteswap32(unsigned int x)
+{
+ return (x >> 24) | (x >> 8 & 0xff00) | (x << 8 & 0xff0000) | (x << 24);
+}
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t) TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ sha1_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA1(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha1_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sha1_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (sha1_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha1" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA1_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ byteswap32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ byteswap32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer sha1 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha1_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm
new file mode 100644
index 000000000..e76409722
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x16_avx512.asm
@@ -0,0 +1,559 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+default rel
+
+;; code to compute 16-lane SHA1 using AVX-512
+;; outer calling routine takes care of save and restore of XMM registers
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers zmm0-31
+
+%define APPEND(a,b) a %+ b
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0 preserved
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2 preserved
+ %define reg4 r9 ; arg3
+ %define var1 rdi
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var1 rdx ; arg2
+ %define var2 rcx ; arg3
+ %define local_func_decl(func_name) global func_name:function internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+
+%define IDX var1
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define KT zmm5
+%define AA zmm6
+%define BB zmm7
+%define CC zmm8
+%define DD zmm9
+%define EE zmm10
+%define TMP0 zmm11
+%define TMP1 zmm12
+%define TMP2 zmm13
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+
+; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
+; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
+; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
+; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
+; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
+; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
+; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
+; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
+; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}
+
+; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+
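+; Net effect of the shuffles below, in scalar terms (sketch only):
+;     for (i = 0; i < 16; i++)            ; dword position within a block
+;         for (lane = 0; lane < 16; lane++)
+;             out[i][lane] = in[lane][i]
+; i.e. after the transpose, register i holds dword i from all 16 lanes.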
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; use r6 in place of t0
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
+
+ vshufps %%r11, %%r6, %%t1, 0xDD ; r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
+ vshufps %%r9, %%r8, %%r10, 0x88 ; r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
+ vshufps %%r8, %%r8, %%r10, 0xDD ; r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
+ vshufps %%r6, %%r6, %%t1, 0x88 ; r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
+
+ ; use r10 in place of t0
+ vshufps %%r10, %%r12, %%r13, 0x44 ; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0}
+ vshufps %%r12, %%r12, %%r13, 0xEE ; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2}
+ vshufps %%t1, %%r14, %%r15, 0x44 ; t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0}
+ vshufps %%r14, %%r14, %%r15, 0xEE ; r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2}
+
+ vshufps %%r15, %%r10, %%t1, 0xDD ; r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1}
+ vshufps %%r13, %%r12, %%r14, 0x88 ; r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2}
+ vshufps %%r12, %%r12, %%r14, 0xDD ; r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3}
+ vshufps %%r10, %%r10, %%t1, 0x88 ; r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0}
+
+;; At this point, the registers that contain interesting data are:
+;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
+;; Can use t1 and r14 as scratch registers
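+;; PSHUFFLE_TRANSPOSE16_MASK1 holds qword indices {0 1 8 9 4 5 12 13} and MASK2 holds
+;; {2 3 10 11 6 7 14 15}; vpermi2q uses them to interleave 64-bit pairs taken from its
+;; two source registers.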
+
+ vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
+ vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
+
+ vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
+ vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
+
+ vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
+ vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
+
+ vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
+ vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
+
+;; At this point r8 and r12 can be used as scratch registers
+
+ vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+
+ vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+ vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+ vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+ vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+ vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+ vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+
+ vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+%endmacro
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
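+; Rotating the A..E symbol names (rather than moving register contents) gives the
+; SHA-1 working-variable rotation for free at assembly time.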
+
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%F_IMMED %2
+
+ ; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
+ ; E=D, D=C, C=ROTL_30(B), B=A, A=T
+
+ ; Ft
+ ; 0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
+ ; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
+ ; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
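+	;
+	; %%F_IMMED is the vpternlogd truth-table byte for Ft above:
+	; 0xCA = Ch, 0x96 = Parity (3-way XOR), 0xE8 = Maj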
+
+ vmovdqa32 TMP1, B ; Copy B
+ vpaddd E, E, %%WT ; E = E + Wt
+ vpternlogd TMP1, C, D, %%F_IMMED ; TMP1 = Ft(B,C,D)
+ vpaddd E, E, KT ; E = E + Wt + Kt
+ vprold TMP0, A, 5 ; TMP0 = ROTL_5(A)
+ vpaddd E, E, TMP1 ; E = Ft(B,C,D) + E + Kt + Wt
+ vprold B, B, 30 ; B = ROTL_30(B)
+ vpaddd E, E, TMP0 ; E = T
+
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_79 4
+%define %%WT %1
+%define %%WTp2 %2
+%define %%WTp8 %3
+%define %%WTp13 %4
+ ; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16)
+ ; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt)
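+	; (vpternlogd with immediate 0x96 is a 3-way XOR, so the first three terms are combined in one instruction)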
+ vpternlogd %%WT, %%WTp2, %%WTp8, 0x96
+ vpxord %%WT, %%WT, %%WTp13
+ vprold %%WT, %%WT, 1
+%endmacro
+
+; Note: this reads in one block of data for a single lane.
+; Once all 16 lanes have been read, the data must be transposed to build the message schedule.
+%macro MSG_SCHED_ROUND_00_15 2
+%define %%WT %1
+%define %%OFFSET %2
+ mov inp0, [IN + (%%OFFSET*8)]
+ vmovups %%WT, [inp0+IDX]
+%endmacro
+
+align 64
+
+; void sha1_mb_x16_avx512(SHA1_MB_ARGS_X16, uint32_t size)
+; arg 1 : pointer to args (SHA1_MB_ARGS_X16 structure)
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+local_func_decl(sha1_mb_x16_avx512)
+sha1_mb_x16_avx512:
+ ;; Initialize digests
+ vmovups A, [DIGEST + 0*64]
+ vmovups B, [DIGEST + 1*64]
+ vmovups C, [DIGEST + 2*64]
+ vmovups D, [DIGEST + 3*64]
+ vmovups E, [DIGEST + 4*64]
+
+ xor IDX, IDX
+
+ ;; transpose input onto stack
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+
+ vmovups W0,[inp0+IDX]
+ vmovups W1,[inp1+IDX]
+ vmovups W2,[inp2+IDX]
+ vmovups W3,[inp3+IDX]
+ vmovups W4,[inp4+IDX]
+ vmovups W5,[inp5+IDX]
+ vmovups W6,[inp6+IDX]
+ vmovups W7,[inp7+IDX]
+
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+
+ vmovups W8, [inp0+IDX]
+ vmovups W9, [inp1+IDX]
+ vmovups W10,[inp2+IDX]
+ vmovups W11,[inp3+IDX]
+ vmovups W12,[inp4+IDX]
+ vmovups W13,[inp5+IDX]
+ vmovups W14,[inp6+IDX]
+ vmovups W15,[inp7+IDX]
+
+lloop:
+ vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ add IDX, 64
+
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+
+%assign I 0
+%rep 16
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+
+ ; Save digests for later addition
+ vmovdqa32 AA, A
+ vmovdqa32 BB, B
+ vmovdqa32 CC, C
+ vmovdqa32 DD, D
+ vmovdqa32 EE, E
+
+ vmovdqa32 KT, [K00_19]
+%assign I 0xCA
+%assign J 0
+%assign K 2
+%assign L 8
+%assign M 13
+%assign N 0
+%rep 64
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+ %if N = 19
+ vmovdqa32 KT, [K20_39]
+ %assign I 0x96
+ %elif N = 39
+ vmovdqa32 KT, [K40_59]
+ %assign I 0xE8
+ %elif N = 59
+ vmovdqa32 KT, [K60_79]
+ %assign I 0x96
+ %endif
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%assign N (N+1)
+%endrep
+
+ ; Check if this is the last block
+ sub SIZE, 1
+ je lastLoop
+
+%assign I 0x96
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_00_15 APPEND(W,J), J
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ jmp lloop
+
+lastLoop:
+; Need to reset argument rotation values to Round 64 values
+%xdefine TMP_ A
+%xdefine A B
+%xdefine B C
+%xdefine C D
+%xdefine D E
+%xdefine E TMP_
+
+ ; Process last 16 rounds
+%assign I 0x96
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ ;; update into data pointers
+%assign I 0
+%rep 8
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ ; Write out digest
+ ; Do we need to untranspose digests???
+ vmovups [DIGEST + 0*64], A
+ vmovups [DIGEST + 1*64], B
+ vmovups [DIGEST + 2*64], C
+ vmovups [DIGEST + 3*64], D
+ vmovups [DIGEST + 4*64], E
+
+ ret
+
+section .data
+align 64
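+; SHA-1 round constants Kt, replicated into every 32-bit lane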
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
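+; pshufb mask that reverses the byte order within each 32-bit dword (big-endian message load)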
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha1_mb_x16_avx512
+no_sha1_mb_x16_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm
new file mode 100644
index 000000000..f01ee7f61
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_avx.asm
@@ -0,0 +1,413 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+default rel
+
+;; code to compute quad SHA1 using AVX
+;; derived from ...\sha1_multiple\sha1_quad4.asm
+;; variation of sha1_mult2.asm : clobbers all xmm regs, rcx left intact
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
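+; (this is Ch(B,C,D) = (B&C) ^ (~B&D) rewritten as D ^ (B & (C ^ D)), so it needs only xor/and/xor and no NOT)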
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF, %%regC,%%regD
+ vpand %%regF, %%regF,%%regB
+ vpxor %%regF, %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-(%%imm))
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-(%%imm))
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[rsp + (%%memW * 16)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqa W14, [rsp + ((%%memW - 14) & 15) * 16]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 16]
+ vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 16]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define XMM_SAVE ((15-15)*16 + 1*8)
+%define FRAMESZ 16*16 + XMM_SAVE
+%define _XMM FRAMESZ - XMM_SAVE
+
+%define VMOVPS vmovups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%define IDX rax
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+%define DIGEST_SIZE (4*5*4)
+
+;%ifdef LINUX
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define ARG1 rdi
+ %define ARG2 rsi
+%else
+ ; Windows
+ %define ARG1 rcx
+ %define ARG2 rdx
+%endif
+
+align 32
+
+; void sha1_mb_x4_avx(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks);
+; arg 1 : ARG1 : pointer to args (only 4 of the 8 lanes used)
+; arg 2 : ARG2 : size (in blocks) ;; assumed to be >= 1
+;
+; Clobbers registers: ARG2, rax, r8-r11, xmm0-xmm15
+;
+global sha1_mb_x4_avx:function internal
+sha1_mb_x4_avx:
+
+ sub rsp, FRAMESZ ;; FRAMESZ + pushes must be odd multiple of 8
+
+ ;; Initialize digests
+ vmovdqa A, [ARG1 + 0*16]
+ vmovdqa B, [ARG1 + 1*16]
+ vmovdqa C, [ARG1 + 2*16]
+ vmovdqa D, [ARG1 + 3*16]
+ vmovdqa E, [ARG1 + 4*16]
+
+ ;; load input pointers
+ mov inp0,[ARG1 + _data_ptr + 0*8]
+ mov inp1,[ARG1 + _data_ptr + 1*8]
+ mov inp2,[ARG1 + _data_ptr + 2*8]
+ mov inp3,[ARG1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ vmovdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+%assign I 0
+%rep 4
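+	; lanes are loaded into T2,T1,T4,T3 so that, per the TRANSPOSE output map above,
+	; the dword-i rows of the four lanes come out in T0..T3 for the stores below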
+ VMOVPS T2,[inp0+IDX]
+ VMOVPS T1,[inp1+IDX]
+ VMOVPS T4,[inp2+IDX]
+ VMOVPS T3,[inp3+IDX]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ vpshufb T0, T0, F
+ vmovdqa [rsp+(I*4+0)*16],T0
+ vpshufb T1, T1, F
+ vmovdqa [rsp+(I*4+1)*16],T1
+ vpshufb T2, T2, F
+ vmovdqa [rsp+(I*4+2)*16],T2
+ vpshufb T3, T3, F
+ vmovdqa [rsp+(I*4+3)*16],T3
+ add IDX, 4*4
+%assign I (I+1)
+%endrep
+
+ ; save old digests
+ vmovdqa AA, A
+ vmovdqa BB, B
+ vmovdqa CC, C
+ vmovdqa DD, D
+ vmovdqa EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ vmovdqa K, [K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqa W16, [rsp + ((16 - 16) & 15) * 16]
+ vmovdqa W15, [rsp + ((16 - 15) & 15) * 16]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ vmovdqa K, [K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ vmovdqa K, [K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ vmovdqa K, [K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ sub ARG2, 1
+ jne lloop
+
+ ; write out digests
+ vmovdqa [ARG1 + 0*16], A
+ vmovdqa [ARG1 + 1*16], B
+ vmovdqa [ARG1 + 2*16], C
+ vmovdqa [ARG1 + 3*16], D
+ vmovdqa [ARG1 + 4*16], E
+
+ ; update input pointers
+ add inp0, IDX
+ mov [ARG1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [ARG1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [ARG1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [ARG1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+
+ ret
+
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm
new file mode 100644
index 000000000..f8619f232
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x4_sse.asm
@@ -0,0 +1,410 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+default rel
+
+;; code to compute quad SHA1 using SSE
+;; derived from ...\sha1_multiple\sha1_quad4.asm
+;; variation of sha1_mult2.asm
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0}
+ shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0}
+ shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0}
+ shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2}
+ shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+; macro MAGIC_F0 F,B,C,D,T ;; F = (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regC
+ pxor %%regF,%%regD
+ pand %%regF,%%regB
+ pxor %%regF,%%regD
+%endmacro
+
+; macro MAGIC_F1 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regD
+ pxor %%regF,%%regC
+ pxor %%regF,%%regB
+%endmacro
+
+; macro MAGIC_F2 F,B,C,D,T ;; F = ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ movdqa %%regF,%%regB
+ movdqa %%regT,%%regB
+ por %%regF,%%regC
+ pand %%regT,%%regC
+ pand %%regF,%%regD
+ por %%regF,%%regT
+%endmacro
+
+; macro MAGIC_F3 F,B,C,D,T ;; F = (B ^ C ^ D)
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ pslld %%reg, %%imm
+ psrld %%tmp, (32-%%imm)
+ por %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ paddd %%regE,%%immCNT
+ paddd %%regE,[rsp + (%%memW * 16)]
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ paddd %%regE,%%immCNT
+ movdqa W14, [rsp + ((%%memW - 14) & 15) * 16]
+ pxor W16, W14
+ pxor W16, [rsp + ((%%memW - 8) & 15) * 16]
+ pxor W16, [rsp + ((%%memW - 3) & 15) * 16]
+ movdqa %%regF, W16
+ pslld W16, 1
+ psrld %%regF, (32-1)
+ por %%regF, W16
+ ROTATE_W
+
+ movdqa [rsp + ((%%memW - 0) & 15) * 16],%%regF
+ paddd %%regE,%%regF
+ movdqa %%regT,%%regA
+ PROLD %%regT,5, %%regF
+ paddd %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ paddd %%regE,%%regF
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define XMM_SAVE ((15-15)*16 + 1*8)
+%define FRAMESZ 16*16 + XMM_SAVE
+%define _XMM FRAMESZ - XMM_SAVE
+
+%define MOVPS movups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%define IDX rax
+
+%define A xmm0
+%define B xmm1
+%define C xmm2
+%define D xmm3
+%define E xmm4
+%define F xmm5 ; tmp
+%define G xmm6 ; tmp
+
+%define TMP G
+%define FUN F
+%define K xmm7
+
+%define AA xmm8
+%define BB xmm9
+%define CC xmm10
+%define DD xmm11
+%define EE xmm12
+
+%define T0 xmm6
+%define T1 xmm7
+%define T2 xmm8
+%define T3 xmm9
+%define T4 xmm10
+%define T5 xmm11
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%define W14 xmm13
+%define W15 xmm14
+%define W16 xmm15
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+%define DIGEST_SIZE (4*5*4)
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux
+ %define ARG1 rdi
+ %define ARG2 rsi
+%else
+ ; Windows
+ %define ARG1 rcx
+ %define ARG2 rdx
+%endif
+
+align 32
+
+; void sha1_mb_x4_sse(SHA1_MB_ARGS_X8 *args, uint32_t size_in_blocks);
+; arg 1 : ARG1 : pointer to args (only 4 of the 8 lanes used)
+; arg 2 : ARG2 : size (in blocks) ;; assumed to be >= 1
+;
+; Clobbers registers: ARG2, rax, r8-r11, xmm0-xmm15
+;
+global sha1_mb_x4_sse:function internal
+sha1_mb_x4_sse:
+
+ sub rsp, FRAMESZ ;; FRAMESZ + pushes must be odd multiple of 8
+
+ ;; Initialize digests
+ movdqa A, [ARG1 + 0*16]
+ movdqa B, [ARG1 + 1*16]
+ movdqa C, [ARG1 + 2*16]
+ movdqa D, [ARG1 + 3*16]
+ movdqa E, [ARG1 + 4*16]
+
+ ;; load input pointers
+ mov inp0,[ARG1 + _data_ptr + 0*8]
+ mov inp1,[ARG1 + _data_ptr + 1*8]
+ mov inp2,[ARG1 + _data_ptr + 2*8]
+ mov inp3,[ARG1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ movdqa F, [PSHUFFLE_BYTE_FLIP_MASK]
+%assign I 0
+%rep 4
+ MOVPS T2,[inp0+IDX]
+ MOVPS T1,[inp1+IDX]
+ MOVPS T4,[inp2+IDX]
+ MOVPS T3,[inp3+IDX]
+ TRANSPOSE T2, T1, T4, T3, T0, T5
+ pshufb T0, F
+ movdqa [rsp+(I*4+0)*16],T0
+ pshufb T1, F
+ movdqa [rsp+(I*4+1)*16],T1
+ pshufb T2, F
+ movdqa [rsp+(I*4+2)*16],T2
+ pshufb T3, F
+ movdqa [rsp+(I*4+3)*16],T3
+ add IDX, 4*4
+%assign I (I+1)
+%endrep
+
+ ; save old digests
+ movdqa AA, A
+ movdqa BB, B
+ movdqa CC, C
+ movdqa DD, D
+ movdqa EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ movdqa K, [K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ movdqa W16, [rsp + ((16 - 16) & 15) * 16]
+ movdqa W15, [rsp + ((16 - 15) & 15) * 16]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ movdqa K, [K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ movdqa K, [K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ movdqa K, [K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+ paddd A,AA
+ paddd B,BB
+ paddd C,CC
+ paddd D,DD
+ paddd E,EE
+
+ sub ARG2, 1
+ jne lloop
+
+ ; write out digests
+ movdqa [ARG1 + 0*16], A
+ movdqa [ARG1 + 1*16], B
+ movdqa [ARG1 + 2*16], C
+ movdqa [ARG1 + 3*16], D
+ movdqa [ARG1 + 4*16], E
+
+ ; update input pointers
+ add inp0, IDX
+ mov [ARG1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [ARG1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [ARG1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [ARG1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+
+ ret
+
+
+section .data align=16
+
+align 16
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm
new file mode 100644
index 000000000..418991db6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_mb_x8_avx2.asm
@@ -0,0 +1,515 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha1_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+default rel
+
+;; code to compute oct SHA1 using AVX2
+;; outer calling routine takes care of save and restore of XMM registers
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers ymm0-15
+
+
+; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+; "transpose" data in {r0...r7} using temps {t0...t1}
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+;
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ ; process bottom half (r4..r7) {e...h}
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
+ vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
+ vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
+ vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
+ vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
+ vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
+ vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
+ vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
+%endmacro
+
+;;
+;; Magic functions defined in FIPS 180-1
+;;
+;MAGIC_F0 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ	;; (D ^ (B & (C ^ D)))
+%macro MAGIC_F0 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF, %%regC,%%regD
+ vpand %%regF, %%regF,%%regB
+ vpxor %%regF, %%regF,%%regD
+%endmacro
+
+;MAGIC_F1 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; (B ^ C ^ D)
+%macro MAGIC_F1 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpxor %%regF,%%regD,%%regC
+ vpxor %%regF,%%regF,%%regB
+%endmacro
+
+
+
+;MAGIC_F2 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ ;; ((B & C) | (B & D) | (C & D))
+%macro MAGIC_F2 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ vpor %%regF,%%regB,%%regC
+ vpand %%regT,%%regB,%%regC
+ vpand %%regF,%%regF,%%regD
+ vpor %%regF,%%regF,%%regT
+%endmacro
+
+;MAGIC_F3 MACRO regF:REQ,regB:REQ,regC:REQ,regD:REQ,regT:REQ
+%macro MAGIC_F3 5
+%define %%regF %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regT %5
+ MAGIC_F1 %%regF,%%regB,%%regC,%%regD,%%regT
+%endmacro
+
+; PROLD reg, imm, tmp
+%macro PROLD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsrld %%tmp, %%reg, (32-%%imm)
+ vpslld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PROLD_nd reg, imm, tmp, src
+%macro PROLD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsrld %%tmp, %%src, (32-%%imm)
+ vpslld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+%macro SHA1_STEP_00_15 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+ vpaddd %%regE, %%regE,[rsp + (%%memW * 32)]
+ PROLD_nd %%regT,5, %%regF,%%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE, %%regE,%%regF
+%endmacro
+
+%macro SHA1_STEP_16_79 10
+%define %%regA %1
+%define %%regB %2
+%define %%regC %3
+%define %%regD %4
+%define %%regE %5
+%define %%regT %6
+%define %%regF %7
+%define %%memW %8
+%define %%immCNT %9
+%define %%MAGIC %10
+ vpaddd %%regE, %%regE,%%immCNT
+
+ vmovdqu W14, [rsp + ((%%memW - 14) & 15) * 32]
+ vpxor W16, W16, W14
+ vpxor W16, W16, [rsp + ((%%memW - 8) & 15) * 32]
+ vpxor W16, W16, [rsp + ((%%memW - 3) & 15) * 32]
+
+ vpsrld %%regF, W16, (32-1)
+ vpslld W16, W16, 1
+ vpor %%regF, %%regF, W16
+ ROTATE_W
+
+ vmovdqu [rsp + ((%%memW - 0) & 15) * 32],%%regF
+ vpaddd %%regE, %%regE,%%regF
+
+ PROLD_nd %%regT,5, %%regF, %%regA
+ vpaddd %%regE, %%regE,%%regT
+ %%MAGIC %%regF,%%regB,%%regC,%%regD,%%regT ;; FUN = MAGIC_Fi(B,C,D)
+ PROLD %%regB,30, %%regT
+ vpaddd %%regE,%%regE,%%regF
+%endmacro
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; FRAMESZ plus pushes must be an odd multiple of 8
+%define YMM_SAVE (15-15)*32
+%define FRAMESZ 32*16 + 0*8 + YMM_SAVE
+%define _YMM FRAMESZ - YMM_SAVE
+
+%define VMOVPS vmovups
+
+%define IDX rax
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ %define inp7 rcx
+ %define arg1 rdi
+ %define arg2 rsi
+ %define RSP_SAVE rdx
+%else
+ %define inp7 rdi
+ %define arg1 rcx
+ %define arg2 rdx
+ %define RSP_SAVE rsi
+%endif
+
+
+; ymm0 A
+; ymm1 B
+; ymm2 C
+; ymm3 D
+; ymm4 E
+; ymm5 F AA
+; ymm6 T0 BB
+; ymm7 T1 CC
+; ymm8 T2 DD
+; ymm9 T3 EE
+; ymm10 T4 TMP
+; ymm11 T5 FUN
+; ymm12 T6 K
+; ymm13 T7 W14
+; ymm14 T8 W15
+; ymm15 T9 W16
+
+%define A ymm0
+%define B ymm1
+%define C ymm2
+%define D ymm3
+%define E ymm4
+
+%define F ymm5
+%define T0 ymm6
+%define T1 ymm7
+%define T2 ymm8
+%define T3 ymm9
+%define T4 ymm10
+%define T5 ymm11
+%define T6 ymm12
+%define T7 ymm13
+%define T8 ymm14
+%define T9 ymm15
+
+%define AA ymm5
+%define BB ymm6
+%define CC ymm7
+%define DD ymm8
+%define EE ymm9
+%define TMP ymm10
+%define FUN ymm11
+%define K ymm12
+%define W14 ymm13
+%define W15 ymm14
+%define W16 ymm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+%macro ROTATE_W 0
+%xdefine TMP_ W16
+%xdefine W16 W15
+%xdefine W15 W14
+%xdefine W14 TMP_
+%endm
+
+%define DIGEST_SIZE (8*5*4) ; 8 streams x 5 32bit words per digest x 4 bytes per word
+
+align 32
+
+; void sha1_mb_x8_avx2(SHA1_MB_ARGS_X8, uint32_t size)
+; arg 1 : pointer to args (SHA1_MB_ARGS_X8 structure)
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+global sha1_mb_x8_avx2:function internal
+sha1_mb_x8_avx2:
+
+ push RSP_SAVE
+
+ ; save rsp
+ mov RSP_SAVE, rsp
+ sub rsp, FRAMESZ ;; FRAMESZ + pushes must be even multiple of 8
+
+ ; align rsp to 32 Bytes
+ and rsp, ~0x1F
+
+ ;; Initialize digests
+ vmovdqu A, [arg1 + 0*32]
+ vmovdqu B, [arg1 + 1*32]
+ vmovdqu C, [arg1 + 2*32]
+ vmovdqu D, [arg1 + 3*32]
+ vmovdqu E, [arg1 + 4*32]
+
+ ;; transpose input onto stack
+ mov inp0,[arg1+_data_ptr+0*8]
+ mov inp1,[arg1+_data_ptr+1*8]
+ mov inp2,[arg1+_data_ptr+2*8]
+ mov inp3,[arg1+_data_ptr+3*8]
+ mov inp4,[arg1+_data_ptr+4*8]
+ mov inp5,[arg1+_data_ptr+5*8]
+ mov inp6,[arg1+_data_ptr+6*8]
+ mov inp7,[arg1+_data_ptr+7*8]
+
+ xor IDX, IDX
+lloop:
+ vmovdqu F, [PSHUFFLE_BYTE_FLIP_MASK]
+%assign I 0
+%rep 2
+ VMOVPS T0,[inp0+IDX]
+ VMOVPS T1,[inp1+IDX]
+ VMOVPS T2,[inp2+IDX]
+ VMOVPS T3,[inp3+IDX]
+ VMOVPS T4,[inp4+IDX]
+ VMOVPS T5,[inp5+IDX]
+ VMOVPS T6,[inp6+IDX]
+ VMOVPS T7,[inp7+IDX]
+ TRANSPOSE8 T0, T1, T2, T3, T4, T5, T6, T7, T8, T9
+ vpshufb T0, T0, F
+ vmovdqu [rsp+(I*8+0)*32],T0
+ vpshufb T1, T1, F
+ vmovdqu [rsp+(I*8+1)*32],T1
+ vpshufb T2, T2, F
+ vmovdqu [rsp+(I*8+2)*32],T2
+ vpshufb T3, T3, F
+ vmovdqu [rsp+(I*8+3)*32],T3
+ vpshufb T4, T4, F
+ vmovdqu [rsp+(I*8+4)*32],T4
+ vpshufb T5, T5, F
+ vmovdqu [rsp+(I*8+5)*32],T5
+ vpshufb T6, T6, F
+ vmovdqu [rsp+(I*8+6)*32],T6
+ vpshufb T7, T7, F
+ vmovdqu [rsp+(I*8+7)*32],T7
+ add IDX, 32
+%assign I (I+1)
+%endrep
+
+
+ ; save old digests
+ vmovdqu AA, A
+ vmovdqu BB, B
+ vmovdqu CC, C
+ vmovdqu DD, D
+ vmovdqu EE, E
+
+;;
+;; perform 0-79 steps
+;;
+ vmovdqu K, [K00_19]
+;; do rounds 0...15
+%assign I 0
+%rep 16
+ SHA1_STEP_00_15 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 16...19
+ vmovdqu W16, [rsp + ((16 - 16) & 15) * 32]
+ vmovdqu W15, [rsp + ((16 - 15) & 15) * 32]
+%rep 4
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F0
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 20...39
+ vmovdqu K, [K20_39]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F1
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 40...59
+ vmovdqu K, [K40_59]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F2
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+;; do rounds 60...79
+ vmovdqu K, [K60_79]
+%rep 20
+ SHA1_STEP_16_79 A,B,C,D,E, TMP,FUN, I, K, MAGIC_F3
+ ROTATE_ARGS
+%assign I (I+1)
+%endrep
+
+ vpaddd A,A,AA
+ vpaddd B,B,BB
+ vpaddd C,C,CC
+ vpaddd D,D,DD
+ vpaddd E,E,EE
+
+ sub arg2, 1
+ jne lloop
+
+ ; write out digests
+ vmovdqu [arg1 + 0*32], A
+ vmovdqu [arg1 + 1*32], B
+ vmovdqu [arg1 + 2*32], C
+ vmovdqu [arg1 + 3*32], D
+ vmovdqu [arg1 + 4*32], E
+
+ ;; update input pointers
+ add inp0, IDX
+ add inp1, IDX
+ add inp2, IDX
+ add inp3, IDX
+ add inp4, IDX
+ add inp5, IDX
+ add inp6, IDX
+ add inp7, IDX
+ mov [arg1+_data_ptr+0*8], inp0
+ mov [arg1+_data_ptr+1*8], inp1
+ mov [arg1+_data_ptr+2*8], inp2
+ mov [arg1+_data_ptr+3*8], inp3
+ mov [arg1+_data_ptr+4*8], inp4
+ mov [arg1+_data_ptr+5*8], inp5
+ mov [arg1+_data_ptr+6*8], inp6
+ mov [arg1+_data_ptr+7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ mov rsp, RSP_SAVE
+
+ pop RSP_SAVE
+ ret
+
+
+
+section .data align=32
+
+align 32
+K00_19: dq 0x5A8279995A827999, 0x5A8279995A827999
+ dq 0x5A8279995A827999, 0x5A8279995A827999
+K20_39: dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+ dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
+K40_59: dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+ dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
+K60_79: dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+ dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c
new file mode 100644
index 000000000..502e10b92
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multi_buffer_example.c
@@ -0,0 +1,104 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha1_mb.h"
+#include "test.h"
+
+// Test messages
+#define TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
+uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO";
+uint8_t msg3[] = TST_STR TST_STR "0123456789:;<";
+uint8_t msg4[] = TST_STR TST_STR TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR";
+uint8_t msg5[] = TST_STR TST_STR TST_STR TST_STR TST_STR "0123456789:;<=>?";
+uint8_t msg6[] =
+ TST_STR TST_STR TST_STR TST_STR TST_STR TST_STR "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU";
+uint8_t msg7[] = "";
+
+// Expected digests
+uint32_t dgst1[] = { 0x84983E44, 0x1C3BD26E, 0xBAAE4AA1, 0xF95129E5, 0xE54670F1 };
+uint32_t dgst2[] = { 0xB7C66452, 0x0FD122B3, 0x55D539F2, 0xA35E6FAA, 0xC2A5A11D };
+uint32_t dgst3[] = { 0x127729B6, 0xA8B2F8A0, 0xA4DDC819, 0x08E1D8B3, 0x67CEEA55 };
+uint32_t dgst4[] = { 0xFDDE2D00, 0xABD5B7A3, 0x699DE6F2, 0x3FF1D1AC, 0x3B872AC2 };
+uint32_t dgst5[] = { 0xE7FCA85C, 0xA4AB3740, 0x6A180B32, 0x0B8D362C, 0x622A96E6 };
+uint32_t dgst6[] = { 0x505B0686, 0xE1ACDF42, 0xB3588B5A, 0xB043D52C, 0x6D8C7444 };
+uint32_t dgst7[] = { 0xDA39A3EE, 0x5E6B4B0D, 0x3255BFEF, 0x95601890, 0xAFD80709 };
+
+uint8_t *msgs[] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 };
+uint32_t *expected_digest[] = { dgst1, dgst2, dgst3, dgst4, dgst5, dgst6, dgst7 };
+
+int check_job(uint32_t * ref, uint32_t * good, int words)
+{
+ int i;
+ for (i = 0; i < words; i++)
+ if (good[i] != ref[i])
+ return 1;
+
+ return 0;
+}
+
+#define MAX_MSGS 7
+
+int main(void)
+{
+ SHA1_HASH_CTX_MGR *mgr = NULL;
+ SHA1_HASH_CTX ctxpool[MAX_MSGS];
+ SHA1_HASH_CTX *p_job;
+ int i, checked = 0, failed = 0;
+ int n = sizeof(msgs) / sizeof(msgs[0]);
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA1_HASH_CTX_MGR));
+ // Initialize multi-buffer manager
+ sha1_ctx_mgr_init(mgr);
+
+ for (i = 0; i < n; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)expected_digest[i];
+
+ p_job = sha1_ctx_mgr_submit(mgr, &ctxpool[i], msgs[i],
+ strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (p_job) { // If we have finished a job, process it
+ checked++;
+ failed += check_job(p_job->job.result_digest, p_job->user_data, SHA1_DIGEST_NWORDS);
+ }
+ }
+
+ // Finish remaining jobs
+ while (NULL != (p_job = sha1_ctx_mgr_flush(mgr))) {
+ checked++;
+ failed += check_job(p_job->job.result_digest, p_job->user_data, SHA1_DIGEST_NWORDS);
+ }
+
+ printf("Example multi-buffer sha1 completed=%d, failed=%d\n", checked, failed);
+ return failed;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm
new file mode 100644
index 000000000..39140a96f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_multibinary.asm
@@ -0,0 +1,83 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern sha1_ctx_mgr_init_sse
+extern sha1_ctx_mgr_submit_sse
+extern sha1_ctx_mgr_flush_sse
+
+extern sha1_ctx_mgr_init_avx
+extern sha1_ctx_mgr_submit_avx
+extern sha1_ctx_mgr_flush_avx
+
+extern sha1_ctx_mgr_init_avx2
+extern sha1_ctx_mgr_submit_avx2
+extern sha1_ctx_mgr_flush_avx2
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern sha1_ctx_mgr_init_avx512
+ extern sha1_ctx_mgr_submit_avx512
+ extern sha1_ctx_mgr_flush_avx512
+%endif
+
+;;; *_mbinit is the initial value of *_dispatched; it is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+; Initialise symbols
+mbin_interface sha1_ctx_mgr_init
+mbin_interface sha1_ctx_mgr_submit
+mbin_interface sha1_ctx_mgr_flush
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ ; Reuse mbin_dispatch_init6 through replacing base by sse version
+ mbin_dispatch_init6 sha1_ctx_mgr_init, sha1_ctx_mgr_init_sse, sha1_ctx_mgr_init_sse, sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2, sha1_ctx_mgr_init_avx512
+ mbin_dispatch_init6 sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_sse, sha1_ctx_mgr_submit_sse, sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2, sha1_ctx_mgr_submit_avx512
+ mbin_dispatch_init6 sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_sse, sha1_ctx_mgr_flush_sse, sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2, sha1_ctx_mgr_flush_avx512
+%else
+ mbin_dispatch_init sha1_ctx_mgr_init, sha1_ctx_mgr_init_sse, sha1_ctx_mgr_init_avx, sha1_ctx_mgr_init_avx2
+ mbin_dispatch_init sha1_ctx_mgr_submit, sha1_ctx_mgr_submit_sse, sha1_ctx_mgr_submit_avx, sha1_ctx_mgr_submit_avx2
+ mbin_dispatch_init sha1_ctx_mgr_flush, sha1_ctx_mgr_flush_sse, sha1_ctx_mgr_flush_avx, sha1_ctx_mgr_flush_avx2
+%endif
+
+;;; func core, ver, snum
+slversion sha1_ctx_mgr_init, 00, 03, 0148
+slversion sha1_ctx_mgr_submit, 00, 03, 0149
+slversion sha1_ctx_mgr_flush, 00, 03, 0150
diff --git a/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c
new file mode 100644
index 000000000..0e1702909
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha1_mb/sha1_ref.c
@@ -0,0 +1,228 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha1_mb.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA1 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#define H0 0x67452301
+#define H1 0xefcdab89
+#define H2 0x98badcfe
+#define H3 0x10325476
+#define H4 0xc3d2e1f0
+
+#define F1(b,c,d) (d ^ (b & (c ^ d)))
+#define F2(b,c,d) (b ^ c ^ d)
+#define F3(b,c,d) ((b & c) | (d & (b | c)))
+#define F4(b,c,d) (b ^ c ^ d)
+
+#define rol32(x, r) (((x)<<(r)) ^ ((x)>>(32-(r))))
+#define bswap(x) (((x)<<24) | (((x)&0xff00)<<8) | (((x)&0xff0000)>>8) | ((x)>>24))
+
+#define W(x) w[(x) & 15]
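+/* W() is a rolling 16-entry message-schedule window; "& 15" wraps the index in place of a full 80-word array */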
+
+#define step00_19(i,a,b,c,d,e) \
+ if (i>15) W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ else W(i) = bswap(ww[i]); \
+ e += rol32(a,5) + F1(b,c,d) + 0x5A827999 + W(i); \
+ b = rol32(b,30)
+
+#define step20_39(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F2(b,c,d) + 0x6ED9EBA1 + W(i); \
+ b = rol32(b,30)
+
+#define step40_59(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F3(b,c,d) + 0x8F1BBCDC + W(i); \
+ b = rol32(b,30)
+
+#define step60_79(i,a,b,c,d,e) \
+ W(i) = rol32(W(i-3)^W(i-8)^W(i-14)^W(i-16), 1); \
+ e += rol32(a,5) + F4(b,c,d) + 0xCA62C1D6 + W(i); \
+ b = rol32(b,30)
+
+void sha1_single(const uint8_t * data, uint32_t digest[]);
+
+void sha1_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA1_BLOCK_SIZE];
+ union {
+ uint64_t uint;
+ uint8_t uchar[8];
+ } convert;
+ uint8_t *p;
+
+ digest[0] = H0;
+ digest[1] = H1;
+ digest[2] = H2;
+ digest[3] = H3;
+ digest[4] = H4;
+
+ i = len;
+ while (i >= SHA1_BLOCK_SIZE) {
+ sha1_single(input_data, digest);
+ input_data += SHA1_BLOCK_SIZE;
+ i -= SHA1_BLOCK_SIZE;
+ }
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA1_BLOCK_SIZE) - SHA1_PADLENGTHFIELD_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA1_BLOCK_SIZE - SHA1_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA1_BLOCK_SIZE;
+ else
+ i = SHA1_BLOCK_SIZE;
+
+	convert.uint = 8 * (uint64_t) len;	/* message length in bits; widen before multiplying */
+ p = buf + i - 8;
+ p[0] = convert.uchar[7];
+ p[1] = convert.uchar[6];
+ p[2] = convert.uchar[5];
+ p[3] = convert.uchar[4];
+ p[4] = convert.uchar[3];
+ p[5] = convert.uchar[2];
+ p[6] = convert.uchar[1];
+ p[7] = convert.uchar[0];
+
+ sha1_single(buf, digest);
+ if (i == (2 * SHA1_BLOCK_SIZE))
+ sha1_single(buf + SHA1_BLOCK_SIZE, digest);
+}
+
+void sha1_single(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e;
+ uint32_t w[16] = { 0 };
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+
+ step00_19(0, a, b, c, d, e);
+ step00_19(1, e, a, b, c, d);
+ step00_19(2, d, e, a, b, c);
+ step00_19(3, c, d, e, a, b);
+ step00_19(4, b, c, d, e, a);
+ step00_19(5, a, b, c, d, e);
+ step00_19(6, e, a, b, c, d);
+ step00_19(7, d, e, a, b, c);
+ step00_19(8, c, d, e, a, b);
+ step00_19(9, b, c, d, e, a);
+ step00_19(10, a, b, c, d, e);
+ step00_19(11, e, a, b, c, d);
+ step00_19(12, d, e, a, b, c);
+ step00_19(13, c, d, e, a, b);
+ step00_19(14, b, c, d, e, a);
+ step00_19(15, a, b, c, d, e);
+ step00_19(16, e, a, b, c, d);
+ step00_19(17, d, e, a, b, c);
+ step00_19(18, c, d, e, a, b);
+ step00_19(19, b, c, d, e, a);
+
+ step20_39(20, a, b, c, d, e);
+ step20_39(21, e, a, b, c, d);
+ step20_39(22, d, e, a, b, c);
+ step20_39(23, c, d, e, a, b);
+ step20_39(24, b, c, d, e, a);
+ step20_39(25, a, b, c, d, e);
+ step20_39(26, e, a, b, c, d);
+ step20_39(27, d, e, a, b, c);
+ step20_39(28, c, d, e, a, b);
+ step20_39(29, b, c, d, e, a);
+ step20_39(30, a, b, c, d, e);
+ step20_39(31, e, a, b, c, d);
+ step20_39(32, d, e, a, b, c);
+ step20_39(33, c, d, e, a, b);
+ step20_39(34, b, c, d, e, a);
+ step20_39(35, a, b, c, d, e);
+ step20_39(36, e, a, b, c, d);
+ step20_39(37, d, e, a, b, c);
+ step20_39(38, c, d, e, a, b);
+ step20_39(39, b, c, d, e, a);
+
+ step40_59(40, a, b, c, d, e);
+ step40_59(41, e, a, b, c, d);
+ step40_59(42, d, e, a, b, c);
+ step40_59(43, c, d, e, a, b);
+ step40_59(44, b, c, d, e, a);
+ step40_59(45, a, b, c, d, e);
+ step40_59(46, e, a, b, c, d);
+ step40_59(47, d, e, a, b, c);
+ step40_59(48, c, d, e, a, b);
+ step40_59(49, b, c, d, e, a);
+ step40_59(50, a, b, c, d, e);
+ step40_59(51, e, a, b, c, d);
+ step40_59(52, d, e, a, b, c);
+ step40_59(53, c, d, e, a, b);
+ step40_59(54, b, c, d, e, a);
+ step40_59(55, a, b, c, d, e);
+ step40_59(56, e, a, b, c, d);
+ step40_59(57, d, e, a, b, c);
+ step40_59(58, c, d, e, a, b);
+ step40_59(59, b, c, d, e, a);
+
+ step60_79(60, a, b, c, d, e);
+ step60_79(61, e, a, b, c, d);
+ step60_79(62, d, e, a, b, c);
+ step60_79(63, c, d, e, a, b);
+ step60_79(64, b, c, d, e, a);
+ step60_79(65, a, b, c, d, e);
+ step60_79(66, e, a, b, c, d);
+ step60_79(67, d, e, a, b, c);
+ step60_79(68, c, d, e, a, b);
+ step60_79(69, b, c, d, e, a);
+ step60_79(70, a, b, c, d, e);
+ step60_79(71, e, a, b, c, d);
+ step60_79(72, d, e, a, b, c);
+ step60_79(73, c, d, e, a, b);
+ step60_79(74, b, c, d, e, a);
+ step60_79(75, a, b, c, d, e);
+ step60_79(76, e, a, b, c, d);
+ step60_79(77, d, e, a, b, c);
+ step60_79(78, c, d, e, a, b);
+ step60_79(79, b, c, d, e, a);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+}
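
For a sense of how the reference routine above is exercised, here is a minimal, illustrative caller. It is a sketch only: it assumes sha1_mb.h defines SHA1_DIGEST_NWORDS (five 32-bit words), and relies on sha1_ref() keeping the digest words in host order as the code above does.

/* Illustrative sketch: hash a short message with the reference sha1_ref().
 * Assumes sha1_mb.h defines SHA1_DIGEST_NWORDS (5 32-bit words). */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include "sha1_mb.h"

void sha1_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len);

int main(void)
{
	const char *msg = "abc";
	uint32_t digest[SHA1_DIGEST_NWORDS];
	int i;

	sha1_ref((const uint8_t *)msg, digest, (uint32_t)strlen(msg));

	for (i = 0; i < SHA1_DIGEST_NWORDS; i++)
		printf("%08x", digest[i]);	/* digest words are host-order uint32s */
	printf("\n");
	return 0;
}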
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am
new file mode 100644
index 000000000..79b223545
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/Makefile.am
@@ -0,0 +1,87 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc += sha256_mb/sha256_ctx_sse.c \
+ sha256_mb/sha256_ctx_avx.c \
+ sha256_mb/sha256_ctx_avx2.c
+
+lsrc += sha256_mb/sha256_mb_mgr_init_sse.c \
+ sha256_mb/sha256_mb_mgr_init_avx2.c
+
+
+lsrc += sha256_mb/sha256_mb_mgr_submit_sse.asm \
+ sha256_mb/sha256_mb_mgr_submit_avx.asm \
+ sha256_mb/sha256_mb_mgr_submit_avx2.asm \
+ sha256_mb/sha256_mb_mgr_flush_sse.asm \
+ sha256_mb/sha256_mb_mgr_flush_avx.asm \
+ sha256_mb/sha256_mb_mgr_flush_avx2.asm \
+ sha256_mb/sha256_mb_x4_sse.asm \
+ sha256_mb/sha256_mb_x4_avx.asm \
+ sha256_mb/sha256_mb_x8_avx2.asm \
+ sha256_mb/sha256_multibinary.asm
+
+lsrc += sha256_mb/sha256_ctx_avx512.c \
+ sha256_mb/sha256_mb_mgr_init_avx512.c \
+ sha256_mb/sha256_mb_mgr_submit_avx512.asm \
+ sha256_mb/sha256_mb_mgr_flush_avx512.asm \
+ sha256_mb/sha256_mb_x16_avx512.asm
+
+extern_hdrs += include/sha256_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ include/multibinary.asm \
+ sha256_mb/sha256_job.asm \
+ sha256_mb/sha256_mb_mgr_datastruct.asm \
+ include/reg_sizes.asm \
+ sha256_mb/sha256_ref.c \
+ include/memcpy_inline.h \
+ include/memcpy.asm \
+ include/intrinreg.h
+
+check_tests += sha256_mb/sha256_mb_test \
+ sha256_mb/sha256_mb_rand_test \
+ sha256_mb/sha256_mb_rand_update_test
+
+unit_tests += sha256_mb/sha256_mb_rand_ssl_test
+
+perf_tests += sha256_mb/sha256_mb_vs_ossl_perf
+
+sha256_mb_rand_ssl_test: sha256_ref.o
+sha256_mb_rand_test: sha256_ref.o
+sha256_mb_sha256_mb_rand_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la
+
+sha256_mb_rand_update_test: sha256_ref.o
+sha256_mb_sha256_mb_rand_update_test_LDADD = sha256_mb/sha256_ref.lo libisal_crypto.la
+
+sha256_mb_rand_ssl_test: LDLIBS += -lcrypto
+sha256_mb_sha256_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sha256_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sha256_mb_sha256_mb_vs_ossl_perf_LDFLAGS = -lcrypto
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c
new file mode 100644
index 000000000..25a5d01d6
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx.c
@@ -0,0 +1,253 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA256_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx_slver_02020154;
+struct slver sha256_ctx_mgr_init_avx_slver = { 0x0154, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_submit_avx_slver_02020155;
+struct slver sha256_ctx_mgr_submit_avx_slver = { 0x0155, 0x02, 0x02 };
+
+struct slver sha256_ctx_mgr_flush_avx_slver_02020156;
+struct slver sha256_ctx_mgr_flush_avx_slver = { 0x0156, 0x02, 0x02 };
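
The submit/flush pair above supports streaming a message in several chunks via the FIRST/UPDATE/LAST flags. The following is a hedged sketch of that usage, not part of the patch: it assumes the hash_ctx_init() helper and the HASH_* flag values from multi_buffer.h (added elsewhere in this patch), and allocates the manager aligned because it embeds SIMD lane state.

/* Sketch: stream a message through the AVX context manager in chunks. */
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sha256_mb.h"

int sha256_stream_avx(const uint8_t *chunks[], const uint32_t chunk_lens[],
		      int n_chunks, uint32_t digest[SHA256_DIGEST_NWORDS])
{
	SHA256_HASH_CTX_MGR *mgr;
	SHA256_HASH_CTX ctx;
	int i;

	if (posix_memalign((void **)&mgr, 64, sizeof(*mgr)))
		return -1;
	sha256_ctx_mgr_init_avx(mgr);
	hash_ctx_init(&ctx);		/* assumed helper from multi_buffer.h */

	for (i = 0; i < n_chunks; i++) {
		HASH_CTX_FLAG f = (i == 0) ? HASH_FIRST : HASH_UPDATE;
		if (i == n_chunks - 1)
			f = (HASH_CTX_FLAG) (f | HASH_LAST);	/* one chunk => FIRST|LAST */

		sha256_ctx_mgr_submit_avx(mgr, &ctx, chunks[i], chunk_lens[i], f);

		/* A context cannot be re-submitted while PROCESSING; flush until it
		 * either completes (after LAST) or goes idle waiting for more input. */
		while (ctx.status & HASH_CTX_STS_PROCESSING)
			sha256_ctx_mgr_flush_avx(mgr);
	}

	memcpy(digest, ctx.job.result_digest, SHA256_DIGEST_NWORDS * sizeof(uint32_t));
	free(mgr);
	return 0;
}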
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c
new file mode 100644
index 000000000..c319383f1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx2.c
@@ -0,0 +1,253 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx2(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx2(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx2(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA256_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx2_slver_04020157;
+struct slver sha256_ctx_mgr_init_avx2_slver = { 0x0157, 0x02, 0x04 };
+
+struct slver sha256_ctx_mgr_submit_avx2_slver_04020158;
+struct slver sha256_ctx_mgr_submit_avx2_slver = { 0x0158, 0x02, 0x04 };
+
+struct slver sha256_ctx_mgr_flush_avx2_slver_04020159;
+struct slver sha256_ctx_mgr_flush_avx2_slver = { 0x0159, 0x02, 0x04 };
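
Single-job use, as sketched after the AVX variant, leaves most lanes idle; the manager earns its keep when several independent jobs are in flight at once. A hedged batching sketch follows, under the same assumptions as before (hash_ctx_init() from multi_buffer.h, aligned manager allocation); it is illustrative, not part of the patch.

/* Sketch: hash n independent buffers concurrently through the AVX2 manager.
 * Jobs complete in arbitrary order; draining with flush until it returns NULL
 * guarantees every context submitted with HASH_ENTIRE has finished. */
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include "sha256_mb.h"

int sha256_batch_avx2(const uint8_t *bufs[], const uint32_t lens[], uint32_t n,
		      uint32_t digests[][SHA256_DIGEST_NWORDS])
{
	SHA256_HASH_CTX_MGR *mgr;
	SHA256_HASH_CTX *ctxs;
	uint32_t i;

	if (posix_memalign((void **)&mgr, 64, sizeof(*mgr)))
		return -1;
	ctxs = calloc(n, sizeof(*ctxs));
	if (!ctxs) {
		free(mgr);
		return -1;
	}
	sha256_ctx_mgr_init_avx2(mgr);

	for (i = 0; i < n; i++) {
		hash_ctx_init(&ctxs[i]);	/* assumed helper from multi_buffer.h */
		sha256_ctx_mgr_submit_avx2(mgr, &ctxs[i], bufs[i], lens[i], HASH_ENTIRE);
	}

	while (sha256_ctx_mgr_flush_avx2(mgr) != NULL)
		;	/* drain: every lane is empty once flush returns NULL */

	for (i = 0; i < n; i++)
		memcpy(digests[i], ctxs[i].job.result_digest,
		       SHA256_DIGEST_NWORDS * sizeof(uint32_t));

	free(ctxs);
	free(mgr);
	return 0;
}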
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c
new file mode 100644
index 000000000..08acb69e5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_avx512.c
@@ -0,0 +1,258 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_avx512(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_avx512(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_avx512(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA256_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_avx512_slver_0600015a;
+struct slver sha256_ctx_mgr_init_avx512_slver = { 0x015a, 0x00, 0x06 };
+
+struct slver sha256_ctx_mgr_submit_avx512_slver_0600015b;
+struct slver sha256_ctx_mgr_submit_avx512_slver = { 0x015b, 0x00, 0x06 };
+
+struct slver sha256_ctx_mgr_flush_avx512_slver_0600015c;
+struct slver sha256_ctx_mgr_flush_avx512_slver = { 0x015c, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c
new file mode 100644
index 000000000..002d3b93e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ctx_sse.c
@@ -0,0 +1,255 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA256_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx);
+
+void sha256_ctx_mgr_init_sse(SHA256_HASH_CTX_MGR * mgr)
+{
+ sha256_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_submit_sse(SHA256_HASH_CTX_MGR * mgr, SHA256_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA256_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA256_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA256_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA256_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha256_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA256_HASH_CTX *sha256_ctx_mgr_flush_sse(SHA256_HASH_CTX_MGR * mgr)
+{
+ SHA256_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha256_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha256_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA256_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA256_HASH_CTX *sha256_ctx_mgr_resubmit(SHA256_HASH_CTX_MGR * mgr,
+ SHA256_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA256_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA256_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA256_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+
+ ctx = (SHA256_HASH_CTX *) sha256_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA256_WORD_T * digest)
+{
+ static const SHA256_WORD_T hash_initial_digest[SHA256_DIGEST_NWORDS] =
+ { SHA256_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA256_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA256_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA256_BLOCK_SIZE - 1) & (0 - (total_len + SHA256_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA256_PADLENGTHFIELD_SIZE;
+
+#if SHA256_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA256_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha256_ctx_mgr_init_sse_slver_00020151;
+struct slver sha256_ctx_mgr_init_sse_slver = { 0x0151, 0x02, 0x00 };
+
+struct slver sha256_ctx_mgr_submit_sse_slver_00020152;
+struct slver sha256_ctx_mgr_submit_sse_slver = { 0x0152, 0x02, 0x00 };
+
+struct slver sha256_ctx_mgr_flush_sse_slver_00020153;
+struct slver sha256_ctx_mgr_flush_sse_slver = { 0x0153, 0x02, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm
new file mode 100644
index 000000000..6bf4b5e50
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_job.asm
@@ -0,0 +1,54 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA256_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA256_JOB
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD	_len,	8,	8	; length in blocks (the ctx layer above submits a block count)
+FIELD _result_digest, 8*4, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+
+%assign _SHA256_JOB_size _FIELD_OFFSET
+%assign _SHA256_JOB_align _STRUCT_ALIGN
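
The FIELD entries above are the assembly-side view of the SHA256_JOB structure declared in include/sha256_mb.h (added elsewhere in this patch). A hedged C rendering of the same layout, for orientation only:

/* Sketch of the layout described by the FIELD entries above; the authoritative
 * definition is SHA256_JOB in include/sha256_mb.h.  Sizes/alignment follow the
 * assembly view (8 bytes reserved for _len; the digest is 64-byte aligned). */
#include <stdint.h>

typedef struct {
	uint8_t *buffer;		/* _buffer: data being hashed                         */
	uint64_t len;			/* _len: block count set by the ctx layer (8 bytes
					 * reserved here; the C header may declare it narrower) */
	uint32_t result_digest[8] __attribute__ ((aligned(64)));	/* _result_digest */
	uint32_t status;		/* _status: STS_UNKNOWN / STS_BEING_PROCESSED / STS_COMPLETED */
	void *user_data;		/* _user_data */
} sha256_job_sketch_t;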
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..e2b11532b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_datastruct.asm
@@ -0,0 +1,74 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA256 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA256_ARGS_X16
+;;; name size align
+FIELD _digest, 4*8*16, 4 ; transposed digest
+FIELD _data_ptr, 8*16, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _SHA256_ARGS_X4_size _FIELD_OFFSET
+%assign _SHA256_ARGS_X4_align _STRUCT_ALIGN
+%assign _SHA256_ARGS_X8_size _FIELD_OFFSET
+%assign _SHA256_ARGS_X8_align _STRUCT_ALIGN
+%assign _SHA256_ARGS_X16_size _FIELD_OFFSET
+%assign _SHA256_ARGS_X16_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _SHA256_ARGS_X4_size, _SHA256_ARGS_X4_align
+FIELD _lens, 4*16, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*16, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
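
Note that the X4, X8 and X16 ARGS sizes above all collapse to the 16-lane layout, so the SSE (4-lane), AVX/AVX2 (4/8-lane) and AVX512 (16-lane) kernels share one manager shape and narrower kernels simply use the leading lanes. A hedged C picture of what the MB_MGR fields describe:

/* Sketch of the manager layout described by the MB_MGR fields above; the real
 * definition is SHA256_MB_JOB_MGR in include/sha256_mb.h. */
#include <stdint.h>

typedef struct {
	uint32_t digest[8][16];		/* _args._digest: transposed, digest[word][lane] */
	uint8_t *data_ptr[16];		/* _args._data_ptr: per-lane input pointers      */
	uint32_t lens[16];		/* _lens: per-lane length keys (see flush code)  */
	uint64_t unused_lanes;		/* _unused_lanes: packed 4-bit free-lane indices */
	struct {
		void *job_in_lane;	/* _ldata[i]._job_in_lane */
	} ldata[16];
	uint32_t num_lanes_inuse;	/* _num_lanes_inuse */
} sha256_mb_mgr_sketch_t;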
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..fcd08aa57
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx.asm
@@ -0,0 +1,230 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_avx
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha256_mb_mgr_flush_avx:function
+sha256_mb_mgr_flush_avx:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 16+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
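The core trick in the flush path above is the lens[] encoding: each entry packs the lane index into the low four bits of the remaining length, so a single unsigned minimum both selects the lane that will finish first and yields the common number of blocks to hash across all lanes. A small stand-alone illustration (not part of the patch):

/* Stand-alone illustration of the lens[] keying used by the flush code above:
 * entry = (blocks << 4) | lane.  Empty lanes are parked at 0xFFFFFFFF so they
 * never win the minimum. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t lens[4] = {
		(3u << 4) | 0,	/* lane 0: 3 blocks remaining */
		(7u << 4) | 1,	/* lane 1: 7 blocks remaining */
		0xFFFFFFFF,	/* lane 2: empty              */
		(5u << 4) | 3,	/* lane 3: 5 blocks remaining */
	};
	uint32_t min = lens[0];
	int i;

	for (i = 1; i < 4; i++)
		if (lens[i] < min)
			min = lens[i];

	printf("run the x4 kernel for %u blocks; lane %u completes first\n",
	       min >> 4,	/* 'shr len2, 4' in the assembly */
	       min & 0xF);	/* 'and idx, 0xF'                */
	return 0;
}
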
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..ca719586b
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx2.asm
@@ -0,0 +1,252 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x8_avx2
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha256_mb_x8_avx2
+%define idx r8
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx2(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha256_mb_mgr_flush_avx2:function
+sha256_mb_mgr_flush_avx2:
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ ; if bit (32+3) is set, then all lanes are empty
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+ cmp qword [state + _ldata + 4 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [four]
+ cmp qword [state + _ldata + 5 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [five]
+ cmp qword [state + _ldata + 6 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [six]
+ cmp qword [state + _ldata + 7 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [seven]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x8_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+one: dq 1
+two: dq 2
+three: dq 3
+four: dq 4
+five: dq 5
+six: dq 6
+seven: dq 7
+
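Flushing with fewer than eight live jobs still drives all eight lanes of sha256_mb_x8_avx2, so the copy loop above gives every idle lane a valid buffer pointer (borrowed from lane idx) and a length of 0xFFFFFFFF that can never become the minimum. A hedged C sketch of that parking step; the array names loosely follow the SHA256_MB_JOB_MGR layout, but the helper itself is invented:

#include <stdint.h>
#include <stddef.h>

#define SHA256_X8_LANES 8
#define EMPTY_LEN 0xFFFFFFFFu

/* Illustrative stand-in for the %rep 8 block above. */
static void park_empty_lanes(const uint8_t *data_ptr[SHA256_X8_LANES],
                             uint32_t lens[SHA256_X8_LANES],
                             void *job_in_lane[SHA256_X8_LANES], int idx)
{
    for (int i = 0; i < SHA256_X8_LANES; i++) {
        if (job_in_lane[i] == NULL) {          /* lane has no job          */
            data_ptr[i] = data_ptr[idx];       /* reuse a live buffer      */
            lens[i] = EMPTY_LEN;               /* never wins the minimum   */
        }
    }
}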
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..7f065b11d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_avx512.asm
@@ -0,0 +1,267 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha256_mb_x16_avx512
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define tmp4 rdx
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%else
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define tmp4 rsi
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+%endif
+
+; Common register definitions
+
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+; idx must be a register not clobbered by sha256_mb_x16_avx512
+%define idx r8
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*8
+_ALIGN_SIZE equ 8
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_avx512(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha256_mb_mgr_flush_avx512:function
+sha256_mb_mgr_flush_avx512:
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 15
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 16
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+	vperm2i128 ymm3, ymm2, ymm2, 1	; ymm3 has {x,x, x,C3,x,x, x,G3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x16_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+lane_8: dq 8
+lane_9: dq 9
+lane_10: dq 10
+lane_11: dq 11
+lane_12: dq 12
+lane_13: dq 13
+lane_14: dq 14
+lane_15: dq 15
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_mgr_flush_avx512
+no_sha256_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
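With 16 lanes the 64-bit unused_lanes word has no spare nibble for a sentinel, so this AVX-512 manager keeps a separate num_lanes_inuse counter and bails out of flush as soon as it reads zero. The two emptiness tests side by side, as a small illustrative C comparison (helper names are made up; the field names do exist in SHA256_MB_JOB_MGR):

#include <stdint.h>
#include <stdbool.h>

/* 4/8-lane managers: a sentinel 0xF nibble sits above the lane stack, so
 * "all lanes free" shows up as a set bit high in unused_lanes.
 * 16-lane manager: every nibble 0x0..0xF is a real lane, so an explicit
 * counter is kept instead. */
static bool all_lanes_free_avx2(uint64_t unused_lanes)
{
    return (unused_lanes >> (32 + 3)) & 1;   /* bt unused_lanes, 32+3 */
}

static bool any_lane_busy_avx512(uint32_t num_lanes_inuse)
{
    return num_lanes_inuse != 0;             /* cmp num_lanes_inuse, 0 */
}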
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..9434d1a8a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_flush_sse.asm
@@ -0,0 +1,230 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_sse
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA256_JOB* sha256_mb_mgr_flush_sse(SHA256_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha256_mb_mgr_flush_sse:function
+sha256_mb_mgr_flush_sse:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 16+3
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ movd xmm1, [state + _args_digest + 4*idx + 4*16]
+ pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
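The movd/pinsrd gather at the end of flush pulls one lane's digest out of a transposed args_digest array: digest word n of all four lanes is stored contiguously, so lane idx's words sit 16 bytes apart rather than back to back, which is exactly what the 4*idx + n*16 offsets address. A rough C picture of that layout; the array shape is inferred from those offsets, so treat it as an assumption:

#include <stdint.h>

#define SHA256_DIGEST_NWORDS 8
#define NLANES 4   /* SSE/AVX managers interleave 4 lanes */

/* args_digest[word][lane]: word-major, lane-minor, i.e. byte offset
 * word*16 + lane*4, matching the "4*idx + n*16" addressing above. */
static void gather_digest(const uint32_t args_digest[SHA256_DIGEST_NWORDS][NLANES],
                          uint32_t out[SHA256_DIGEST_NWORDS], int idx)
{
    for (int w = 0; w < SHA256_DIGEST_NWORDS; w++)
        out[w] = args_digest[w][idx];   /* one movd/pinsrd per word */
}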
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..627eaf6e2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx2.c
@@ -0,0 +1,40 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+
+void sha256_mb_mgr_init_avx2(SHA256_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF76543210;
+ for (j = 0; j < SHA256_X8_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
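The initial value 0xF76543210 is a stack of free-lane indices, one nibble per lane with an 0xF sentinel on top: submit pops the low nibble to claim a lane, flush pushes a retired lane back on. A hedged C sketch of those two operations (helper names are invented):

#include <stdint.h>

/* unused_lanes for the 8-lane manager starts as 0xF76543210:
 * lanes 0..7 stacked low-to-high, sentinel 0xF above them. */
static unsigned pop_lane(uint64_t *unused_lanes)
{
    unsigned lane = (unsigned)(*unused_lanes & 0xF); /* and lane, 0xF      */
    *unused_lanes >>= 4;                             /* shr unused_lanes,4 */
    return lane;
}

static void push_lane(uint64_t *unused_lanes, unsigned lane)
{
    *unused_lanes = (*unused_lanes << 4) | lane;     /* shl ...; or ..., idx */
}

After eight pops only the sentinel is left (unused_lanes == 0xF), which is precisely the "all lanes busy" test used by the 4- and 8-lane submit paths.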
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..5c7bb0d4e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_avx512.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+
+void sha256_mb_mgr_init_avx512(SHA256_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xfedcba9876543210;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA256_MAX_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
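The 16-lane variant differs only in that every nibble of unused_lanes is a real lane index (0x0 through 0xF), leaving no room for a sentinel, which is why num_lanes_inuse is zeroed here and maintained by the submit/flush paths instead. A tiny self-check of that encoding (illustrative only, not part of the patch's tests):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t unused_lanes = 0xfedcba9876543210ull; /* value set by sha256_mb_mgr_init_avx512() */

    for (unsigned expect = 0; expect < 16; expect++) {
        assert((unused_lanes & 0xF) == expect);    /* lanes pop in order 0..15 */
        unused_lanes >>= 4;
    }
    return 0;
}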
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c
new file mode 100644
index 000000000..0610bf69d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_init_sse.c
@@ -0,0 +1,40 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha256_mb.h"
+
+void sha256_mb_mgr_init_sse(SHA256_MB_JOB_MGR * state)
+{
+ unsigned int j;
+ state->unused_lanes = 0xF3210;
+ for (j = 0; j < SHA256_MIN_LANES; j++) {
+ state->lens[j] = 0;
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..31d626a67
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx.asm
@@ -0,0 +1,253 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_avx
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE 16*10
+%define _GPR_SAVE 8*5
+%define STACK_SPACE _GPR_SAVE + _XMM_SAVE
+
+; SHA256_JOB* sha256_mb_mgr_submit_avx(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global sha256_mb_mgr_submit_avx:function
+sha256_mb_mgr_submit_avx:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqa [rsp + 16*0], xmm6
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ vmovdqa [rsp + 16*4], xmm10
+ vmovdqa [rsp + 16*5], xmm11
+ vmovdqa [rsp + 16*6], xmm12
+ vmovdqa [rsp + 16*7], xmm13
+ vmovdqa [rsp + 16*8], xmm14
+ vmovdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*16], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*16], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 16*0]
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ vmovdqa xmm10, [rsp + 16*4]
+ vmovdqa xmm11, [rsp + 16*5]
+ vmovdqa xmm12, [rsp + 16*6]
+ vmovdqa xmm13, [rsp + 16*7]
+ vmovdqa xmm14, [rsp + 16*8]
+ vmovdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
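Submit is mostly bookkeeping: pop a free lane, record the job, its packed length and incoming digest, and return NULL unless this submission filled the last lane; only then is the shortest lane chosen and sha256_mb_x4_avx run. A compressed C sketch of that control flow, with placeholder types and names (not the ISA-L API):

#include <stdint.h>
#include <stddef.h>

/* Placeholder lane record; only the control flow mirrors the assembly above. */
typedef struct { void *job_in_lane; } lane_data_t;

static void *sha256_submit_sketch(uint64_t *unused_lanes, uint32_t lens[4],
                                  lane_data_t ldata[4], void *job, uint32_t blocks)
{
    unsigned lane = (unsigned)(*unused_lanes & 0xF);  /* pop a free lane      */
    *unused_lanes >>= 4;

    ldata[lane].job_in_lane = job;
    lens[lane] = (blocks << 4) | lane;        /* shl len, 4 ; or len, lane    */

    if (*unused_lanes != 0xF)                 /* a lane is still free:        */
        return NULL;                          /* queue the job, run nothing   */

    /* All four lanes are busy: the assembly now finds the minimum length,
     * calls sha256_mb_x4_avx for that many blocks, and returns the job of
     * the lane that finished; that part is elided in this sketch. */
    return NULL;
}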
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..70b57db56
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx2.asm
@@ -0,0 +1,239 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "memcpy.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x8_avx2
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define idx r8
+%define last_len r8
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; SHA256_JOB* sha256_mb_mgr_submit_avx2(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global sha256_mb_mgr_submit_avx2:function
+sha256_mb_mgr_submit_avx2:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ mov [lane_data + _job_in_lane], job
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovdqu xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*8], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*8], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*8], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*8], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*4*8], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*4*8], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*4*8], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*4*8], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ cmp unused_lanes, 0xf
+ jne return_null
+
+start_loop:
+ ; Find min length
+ vmovdqa xmm0, [state + _lens + 0*16]
+ vmovdqa xmm1, [state + _lens + 1*16]
+
+ vpminud xmm2, xmm0, xmm1 ; xmm2 has {D,C,B,A}
+ vpalignr xmm3, xmm3, xmm2, 8 ; xmm3 has {x,x,D,C}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has {x,x,E,F}
+ vpalignr xmm3, xmm3, xmm2, 4 ; xmm3 has {x,x,x,E}
+ vpminud xmm2, xmm2, xmm3 ; xmm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand xmm2, xmm2, [rel clear_low_nibble]
+ vpshufd xmm2, xmm2, 0
+
+ vpsubd xmm0, xmm0, xmm2
+ vpsubd xmm1, xmm1, xmm2
+
+ vmovdqa [state + _lens + 0*16], xmm0
+ vmovdqa [state + _lens + 1*16], xmm1
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x8_avx2
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*8]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*8], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*8], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*8], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*8]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*8], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*8], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*8], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
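Because each length word keeps its lane index in the low nibble, running the common block count is a mask and a subtract: the minimum is ANDed with clear_low_nibble, broadcast, and subtracted from every lane, leaving each lane's index intact while its block count drops by the amount just hashed. A worked C example of that arithmetic with invented values (4 lanes for brevity; the AVX2 manager does the same over 8):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* lanes 0..3 holding 5, 2, 7 and 2 blocks respectively */
    uint32_t lens[4] = { (5u << 4) | 0, (2u << 4) | 1, (7u << 4) | 2, (2u << 4) | 3 };

    uint32_t min = lens[0];
    for (int i = 1; i < 4; i++)
        if (lens[i] < min)
            min = lens[i];

    uint32_t idx    = min & 0xF;    /* lane 1 wins: same blocks, lower index  */
    uint32_t common = min & ~0xFu;  /* vpand with clear_low_nibble            */

    for (int i = 0; i < 4; i++)
        lens[i] -= common;          /* vpsubd: every lane loses 2 blocks      */

    printf("run %u blocks, lane %u completes\n", common >> 4, idx);
    return 0;
}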
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..285189fec
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_avx512.asm
@@ -0,0 +1,257 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "memcpy.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+extern sha256_mb_x16_avx512
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define idx r8
+%define last_len r8
+%define p r11
+%define start_offset r11
+%define num_lanes_inuse r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+
+%define tmp r9
+
+%define lane_data r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define STACK_SPACE 8*8 + 16*10 + 8
+
+; SHA256_JOB* sha256_mb_mgr_submit_avx512(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global sha256_mb_mgr_submit_avx512:function
+sha256_mb_mgr_submit_avx512:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + 8*0], rbx
+ mov [rsp + 8*3], rbp
+ mov [rsp + 8*4], r12
+ mov [rsp + 8*5], r13
+ mov [rsp + 8*6], r14
+ mov [rsp + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + 8*1], rsi
+ mov [rsp + 8*2], rdi
+ vmovdqa [rsp + 8*8 + 16*0], xmm6
+ vmovdqa [rsp + 8*8 + 16*1], xmm7
+ vmovdqa [rsp + 8*8 + 16*2], xmm8
+ vmovdqa [rsp + 8*8 + 16*3], xmm9
+ vmovdqa [rsp + 8*8 + 16*4], xmm10
+ vmovdqa [rsp + 8*8 + 16*5], xmm11
+ vmovdqa [rsp + 8*8 + 16*6], xmm12
+ vmovdqa [rsp + 8*8 + 16*7], xmm13
+ vmovdqa [rsp + 8*8 + 16*8], xmm14
+ vmovdqa [rsp + 8*8 + 16*9], xmm15
+%endif
+ mov unused_lanes, [state + _unused_lanes]
+ mov lane, unused_lanes
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ mov [lane_data + _job_in_lane], job
+
+ ; Load digest words from result_digest
+ vmovdqu xmm0, [job + _result_digest + 0*16]
+ vmovdqu xmm1, [job + _result_digest + 1*16]
+ vmovd [state + _args_digest + 4*lane + 0*4*16], xmm0
+ vpextrd [state + _args_digest + 4*lane + 1*4*16], xmm0, 1
+ vpextrd [state + _args_digest + 4*lane + 2*4*16], xmm0, 2
+ vpextrd [state + _args_digest + 4*lane + 3*4*16], xmm0, 3
+ vmovd [state + _args_digest + 4*lane + 4*4*16], xmm1
+ vpextrd [state + _args_digest + 4*lane + 5*4*16], xmm1, 1
+ vpextrd [state + _args_digest + 4*lane + 6*4*16], xmm1, 2
+ vpextrd [state + _args_digest + 4*lane + 7*4*16], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 16
+ jne return_null
+
+start_loop:
+	; Find min length; ymm0 holds the lengths of the first 8 lanes, ymm1 the last 8
+ vmovdqu ymm0, [state + _lens + 0*32]
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminud ymm2, ymm0, ymm1 ; ymm2 has {H1,G1,F1,E1,D1,C1,B1,A1}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,x,H1,G1,x,x,D1,C1}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x,H2,G2,x,x,D2,C2}
+ vpalignr ymm3, ymm3, ymm2, 4 ; ymm3 has {x,x, x,H2,x,x, x,D2}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has {x,x, x,G3,x,x, x,C3}
+	vperm2i128 ymm3, ymm2, ymm2, 1	; ymm3 has {x,x, x,C3,x,x, x,G3}
+ vpminud ymm2, ymm2, ymm3 ; ymm2 has min value in low dword
+
+ vmovd DWORD(idx), xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 4
+ jz len_is_0
+
+ vpand ymm2, ymm2, [rel clear_low_nibble]
+ vpshufd ymm2, ymm2, 0
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x16_avx512
+
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovd xmm0, [state + _args_digest + 4*idx + 0*4*16]
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 1*4*16], 1
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 2*4*16], 2
+ vpinsrd xmm0, [state + _args_digest + 4*idx + 3*4*16], 3
+ vmovd xmm1, [state + _args_digest + 4*idx + 4*4*16]
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 5*4*16], 1
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 6*4*16], 2
+ vpinsrd xmm1, [state + _args_digest + 4*idx + 7*4*16], 3
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 8*8 + 16*0]
+ vmovdqa xmm7, [rsp + 8*8 + 16*1]
+ vmovdqa xmm8, [rsp + 8*8 + 16*2]
+ vmovdqa xmm9, [rsp + 8*8 + 16*3]
+ vmovdqa xmm10, [rsp + 8*8 + 16*4]
+ vmovdqa xmm11, [rsp + 8*8 + 16*5]
+ vmovdqa xmm12, [rsp + 8*8 + 16*6]
+ vmovdqa xmm13, [rsp + 8*8 + 16*7]
+ vmovdqa xmm14, [rsp + 8*8 + 16*8]
+ vmovdqa xmm15, [rsp + 8*8 + 16*9]
+ mov rsi, [rsp + 8*1]
+ mov rdi, [rsp + 8*2]
+%endif
+ mov rbx, [rsp + 8*0]
+ mov rbp, [rsp + 8*3]
+ mov r12, [rsp + 8*4]
+ mov r13, [rsp + 8*5]
+ mov r14, [rsp + 8*6]
+ mov r15, [rsp + 8*7]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=32
+
+align 32
+clear_low_nibble:
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+ dq 0x00000000FFFFFFF0, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_mgr_submit_avx512
+no_sha256_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
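As in the AVX-512 flush, fullness here is tracked by num_lanes_inuse: each submit increments it, and only a count of 16 triggers sha256_mb_x16_avx512, while the HAVE_AS_KNOWS_AVX512 guard compiles the whole body away (leaving only a dummy win64 symbol) when the assembler cannot emit AVX-512. A brief C sketch of the counter-based gate (illustrative, not ISA-L code):

#include <stdint.h>
#include <stdbool.h>

#define SHA256_MAX_LANES 16

/* Mirrors the add/cmp on _num_lanes_inuse above; not an ISA-L function. */
static bool lanes_now_full(uint32_t *num_lanes_inuse)
{
    *num_lanes_inuse += 1;                       /* add num_lanes_inuse, 1 */
    return *num_lanes_inuse == SHA256_MAX_LANES; /* cmp num_lanes_inuse, 16 */
}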
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..1f85f1014
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_mgr_submit_sse.asm
@@ -0,0 +1,254 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_job.asm"
+%include "sha256_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha256_mb_x4_sse
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE 16*10
+%define _GPR_SAVE 8*5
+%define STACK_SPACE _GPR_SAVE + _XMM_SAVE
+
+; SHA256_JOB* sha256_mb_mgr_submit_sse(SHA256_MB_JOB_MGR *state, SHA256_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global sha256_mb_mgr_submit_sse:function
+sha256_mb_mgr_submit_sse:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ movdqa [rsp + 16*0], xmm6
+ movdqa [rsp + 16*1], xmm7
+ movdqa [rsp + 16*2], xmm8
+ movdqa [rsp + 16*3], xmm9
+ movdqa [rsp + 16*4], xmm10
+ movdqa [rsp + 16*5], xmm11
+ movdqa [rsp + 16*6], xmm12
+ movdqa [rsp + 16*7], xmm13
+ movdqa [rsp + 16*8], xmm14
+ movdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ and lane, 0xF
+ shr unused_lanes, 4
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ shl len, 4
+ or len, lane
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4*lane], DWORD(len)
+
+ ; Load digest words from result_digest
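+	; args_digest is stored transposed across lanes: digest word j of lane k
+	; lives at offset 4*k + 16*j, hence the dword-at-a-time scatter below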
+ movdqa xmm0, [job + _result_digest + 0*16]
+ movdqa xmm1, [job + _result_digest + 1*16]
+ movd [state + _args_digest + 4*lane + 0*16], xmm0
+ pextrd [state + _args_digest + 4*lane + 1*16], xmm0, 1
+ pextrd [state + _args_digest + 4*lane + 2*16], xmm0, 2
+ pextrd [state + _args_digest + 4*lane + 3*16], xmm0, 3
+ movd [state + _args_digest + 4*lane + 4*16], xmm1
+ pextrd [state + _args_digest + 4*lane + 5*16], xmm1, 1
+ pextrd [state + _args_digest + 4*lane + 6*16], xmm1, 2
+ pextrd [state + _args_digest + 4*lane + 7*16], xmm1, 3
+
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ cmp unused_lanes, 0xF
+ jne return_null
+
+start_loop:
+ ; Find min length
+ mov DWORD(lens0), [state + _lens + 0*4]
+ mov idx, lens0
+ mov DWORD(lens1), [state + _lens + 1*4]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov DWORD(lens2), [state + _lens + 2*4]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov DWORD(lens3), [state + _lens + 3*4]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 4
+ mov [state + _lens + 0*4], DWORD(lens0)
+ mov [state + _lens + 1*4], DWORD(lens1)
+ mov [state + _lens + 2*4], DWORD(lens2)
+ mov [state + _lens + 3*4], DWORD(lens3)
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha256_mb_x4_sse
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 4
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ movd xmm0, [state + _args_digest + 4*idx + 0*16]
+ pinsrd xmm0, [state + _args_digest + 4*idx + 1*16], 1
+ pinsrd xmm0, [state + _args_digest + 4*idx + 2*16], 2
+ pinsrd xmm0, [state + _args_digest + 4*idx + 3*16], 3
+ movd xmm1, [state + _args_digest + 4*idx + 4*16]
+ pinsrd xmm1, [state + _args_digest + 4*idx + 5*16], 1
+ pinsrd xmm1, [state + _args_digest + 4*idx + 6*16], 2
+ pinsrd xmm1, [state + _args_digest + 4*idx + 7*16], 3
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 16*0]
+ movdqa xmm7, [rsp + 16*1]
+ movdqa xmm8, [rsp + 16*2]
+ movdqa xmm9, [rsp + 16*3]
+ movdqa xmm10, [rsp + 16*4]
+ movdqa xmm11, [rsp + 16*5]
+ movdqa xmm12, [rsp + 16*6]
+ movdqa xmm13, [rsp + 16*7]
+ movdqa xmm14, [rsp + 16*8]
+ movdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c
new file mode 100644
index 000000000..70d86d9ed
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_ssl_test.c
@@ -0,0 +1,158 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+unsigned int byteswap(unsigned int x)
+{
+ return (x >> 24) | (x >> 8 & 0xff00) | (x << 8 & 0xff0000) | (x << 24);
+}
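+
+/*
+ * OpenSSL's SHA256() writes the digest as big-endian bytes, while the
+ * multi-buffer result_digest[] holds 32-bit words in host order, so on a
+ * little-endian machine each OpenSSL word must be byte-swapped before the
+ * comparison below.
+ */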
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+
+ printf("multibinary_sha256 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ byteswap(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ byteswap(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+			// Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ SHA256(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ byteswap(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ byteswap(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256_ssl rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c
new file mode 100644
index 000000000..350d3b6bb
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_test.c
@@ -0,0 +1,197 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+
+ printf("multibinary_sha256 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ sha256_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha256_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+ // Run sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%08X <=> 0x%08X \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sha256_mb test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%08X <=> 0x%08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ sha256_ctx_mgr_init(mgr);
+
+ // Extend to the end of allocated buffer to construct jobs
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ sha256_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // sb_sha256 test
+ sha256_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%08X"
+ ", ref: 0x%08X\n", i, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256 rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c
new file mode 100644
index 000000000..1cd6583af
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_rand_update_test.c
@@ -0,0 +1,294 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha256_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*SHA256_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA256_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint32_t digest_ref[TEST_BUFS][SHA256_DIGEST_NWORDS];
+
+extern void sha256_ref(uint8_t * input_data, uint32_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+
+ printf("multibinary_sha256_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ sha256_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+			printf("malloc failed, test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha256_ref(bufs[i], digest_ref[i], TEST_LEN);
+ }
+
+ // Run sb_sha256 tests
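+	/*
+	 * Each buffer is fed to the manager in UPDATE_SIZE chunks: the first
+	 * chunk goes in with HASH_FIRST, middle chunks with HASH_UPDATE and
+	 * the remaining tail with HASH_LAST.  submit() may hand back a
+	 * different, still unfinished context; its buffer pointer is advanced
+	 * and it is resubmitted before moving on to the next buffer.
+	 */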
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ // Add jobs while available or finished
+		// Keep submitting new jobs while lanes are available (NULL) or the returned job is finished
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha256_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha256_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha256_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d fail %8X <=> %8X",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ sha256_ref(bufs[i], digest_ref[i], lens[i]);
+ }
+
+ sha256_ctx_mgr_init(mgr);
+
+ // Run sha256_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = SHA256_BLOCK_SIZE +
+ SHA256_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % SHA256_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem,
+ HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha256_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha256_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % SHA256_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha256_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail %8X <=> %8X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256_update rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c
new file mode 100644
index 000000000..60684e155
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_test.c
@@ -0,0 +1,235 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha256_mb.h"
+
+typedef uint32_t DigestSHA256[SHA256_DIGEST_NWORDS];
+
+#define MSGS 7
+#define NUM_JOBS 1000
+
+#define PSEUDO_RANDOM_NUM(seed) ((seed) * 5 + ((seed) * (seed)) / 64) % MSGS
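+/* Maps a job index to one of the MSGS reference messages in a fixed,
+ * repeatable pseudo-random order for the larger NUM_JOBS test below. */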
+
+static uint8_t msg1[] = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq";
+static uint8_t msg2[] = "0123456789:;<=>?@ABCDEFGHIJKLMNO";
+static uint8_t msg3[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<";
+static uint8_t msg4[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQR";
+static uint8_t msg5[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?";
+static uint8_t msg6[] =
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX" "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWX"
+ "0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTU";
+static uint8_t msg7[] = "";
+
+static DigestSHA256 expResultDigest1 = { 0x248D6A61, 0xD20638B8, 0xE5C02693, 0x0C3E6039,
+ 0xA33CE459, 0x64FF2167, 0xF6ECEDD4, 0x19DB06C1
+};
+
+static DigestSHA256 expResultDigest2 = { 0xD9C2E699, 0x586B948F, 0x4022C799, 0x4FFE14C6,
+ 0x3A4E8E31, 0x2EE2AEE1, 0xEBE51BED, 0x85705CFD
+};
+
+static DigestSHA256 expResultDigest3 = { 0xE3057651, 0x81295681, 0x7ECF1791, 0xFF9A1619,
+ 0xB2BC5CAD, 0x2AC00018, 0x92AE489C, 0x48DD10B3
+};
+
+static DigestSHA256 expResultDigest4 = { 0x0307DAA3, 0x7130A140, 0x270790F9, 0x95B71407,
+ 0x8EC752A6, 0x084EC1F3, 0xBD873D79, 0x3FF78383
+};
+
+static DigestSHA256 expResultDigest5 = { 0x679312F7, 0x2E18D599, 0x5F51BDC6, 0x4ED56AFD,
+ 0x9B5704D3, 0x4387E11C, 0xC2331089, 0x2CD45DAA
+};
+
+static DigestSHA256 expResultDigest6 = { 0x8B1767E9, 0x7BA7BBE5, 0xF9A6E8D9, 0x9996904F,
+ 0x3AF6562E, 0xA58AF438, 0x5D8D584B, 0x81C808CE
+};
+
+static DigestSHA256 expResultDigest7 = { 0xE3B0C442, 0x98FC1C14, 0x9AFBF4C8, 0x996FB924,
+ 0x27AE41E4, 0x649B934C, 0xA495991B, 0x7852B855
+};
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7 };
+
+static uint32_t *expResultDigest[MSGS] = {
+ expResultDigest1, expResultDigest2, expResultDigest3,
+ expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7
+};
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint32_t *good;
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ sha256_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the submit."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+
+ }
+ }
+
+ while (1) {
+ ctx = sha256_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+ ctx = sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = sha256_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %08X, should be %08X\n",
+ t, j, ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+				printf("Something bad happened during the flush."
+ " Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sha256 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..07184ae19
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_vs_ossl_perf.c
@@ -0,0 +1,130 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha256_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 4000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 20
+# define TEST_TYPE_STR "_cold"
+#endif
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][4 * SHA256_DIGEST_NWORDS];
+
+inline unsigned int byteswap32(unsigned int x)
+{
+ return (x >> 24) | (x >> 8 & 0xff00) | (x << 8 & 0xff0000) | (x << 24);
+}
+
+int main(void)
+{
+ SHA256_HASH_CTX_MGR *mgr = NULL;
+ SHA256_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t) TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+			printf("calloc failed, test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA256_HASH_CTX_MGR));
+ sha256_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA256(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha256_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sha256_ctx_mgr_submit(mgr,
+ &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (sha256_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha256" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA256_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ byteswap32(((uint32_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %08X <=> %08X\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ byteswap32(((uint32_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+	printf("Multi-buffer sha256 test complete: %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha256_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm
new file mode 100644
index 000000000..13ec15fd9
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x16_avx512.asm
@@ -0,0 +1,927 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+
+default rel
+
+;; code to compute 16 SHA256 digests in parallel using AVX512
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers zmm0-31
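+;;
+;; Data layout: after TRANSPOSE16 each register W0..W15 holds message word t
+;; from all 16 lanes (one dword per lane), and A..H likewise hold one working
+;; variable for all 16 lanes, so every round updates 16 hashes at once.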
+
+%define APPEND(a,b) a %+ b
+
+; Define Stack Layout
+START_FIELDS
+;;; name size align
+FIELD _DIGEST_SAVE, 8*64, 64
+FIELD _rsp, 8, 8
+%assign STACK_SPACE _FIELD_OFFSET
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0 preserved
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2 preserved
+ %define reg4 r9 ; arg3
+ %define var1 rdi
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var1 rdx ; arg2
+ %define var2 rcx ; arg3
+ %define local_func_decl(func_name) global func_name:function internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+
+%define IDX var1
+%define TBL var2
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+%define T1 zmm8
+%define TMP0 zmm9
+%define TMP1 zmm10
+%define TMP2 zmm11
+%define TMP3 zmm12
+%define TMP4 zmm13
+%define TMP5 zmm14
+%define TMP6 zmm15
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%macro TRANSPOSE16 18
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%r8 %9
+%define %%r9 %10
+%define %%r10 %11
+%define %%r11 %12
+%define %%r12 %13
+%define %%r13 %14
+%define %%r14 %15
+%define %%r15 %16
+%define %%t0 %17
+%define %%t1 %18
+
+; r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
+; r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
+; r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
+; r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
+; r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
+; r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
+; r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
+; r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
+; r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}
+
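+; After the transpose, each register holds the same word index from all 16 lanes: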
+; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+
+
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
+
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%r2, %%r2, %%t1, 0x88 ; r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
+
+ ; use r6 in place of t0
+ vshufps %%r6, %%r8, %%r9, 0x44 ; r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
+ vshufps %%r8, %%r8, %%r9, 0xEE ; r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
+ vshufps %%t1, %%r10, %%r11, 0x44 ; t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
+ vshufps %%r10, %%r10, %%r11, 0xEE ; r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
+
+	vshufps	%%r11, %%r6, %%t1, 0xDD	; r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
+	vshufps	%%r9, %%r8, %%r10, 0x88	; r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
+	vshufps	%%r8, %%r8, %%r10, 0xDD	; r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
+	vshufps	%%r6, %%r6, %%t1, 0x88	; r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
+
+ ; use r10 in place of t0
+	vshufps	%%r10, %%r12, %%r13, 0x44	; r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0}
+	vshufps	%%r12, %%r12, %%r13, 0xEE	; r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2}
+	vshufps	%%t1, %%r14, %%r15, 0x44	; t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0}
+	vshufps	%%r14, %%r14, %%r15, 0xEE	; r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2}
+
+	vshufps	%%r15, %%r10, %%t1, 0xDD	; r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1}
+	vshufps	%%r13, %%r12, %%r14, 0x88	; r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2}
+	vshufps	%%r12, %%r12, %%r14, 0xDD	; r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3}
+	vshufps	%%r10, %%r10, %%t1, 0x88	; r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0}
+
+;; At this point, the registers that contain interesting data are:
+;; t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
+;; Can use t1 and r14 as scratch registers
+
+ vmovdqa32 %%r14, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r14, %%t0, %%r2 ; r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
+ vmovdqa32 %%t1, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t1, %%t0, %%r2 ; t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vmovdqa32 %%r2, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r2, %%r3, %%r7 ; r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
+ vmovdqa32 %%t0, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%t0, %%r3, %%r7 ; t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vmovdqa32 %%r3, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r3, %%r1, %%r5 ; r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r7, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r7, %%r1, %%r5 ; r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vmovdqa32 %%r1, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r1, %%r0, %%r4 ; r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
+ vmovdqa32 %%r5, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r5, %%r0, %%r4 ; r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vmovdqa32 %%r0, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r0, %%r6, %%r10 ; r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
+ vmovdqa32 %%r4, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r4, %%r6, %%r10 ; r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
+
+ vmovdqa32 %%r6, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r6, %%r11, %%r15 ; r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
+ vmovdqa32 %%r10, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r10, %%r11, %%r15 ; r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
+
+ vmovdqa32 %%r11, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r11, %%r9, %%r13 ; r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
+ vmovdqa32 %%r15, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r15, %%r9, %%r13 ; r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
+
+ vmovdqa32 %%r9, [PSHUFFLE_TRANSPOSE16_MASK1]
+ vpermi2q %%r9, %%r8, %%r12 ; r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
+ vmovdqa32 %%r13, [PSHUFFLE_TRANSPOSE16_MASK2]
+ vpermi2q %%r13, %%r8, %%r12 ; r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
+
+;; At this point r8 and r12 can be used as scratch registers
+
+ vshuff64x2 %%r8, %%r14, %%r0, 0xEE ; r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
+ vshuff64x2 %%r0, %%r14, %%r0, 0x44 ; r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
+
+ vshuff64x2 %%r12, %%t1, %%r4, 0xEE ; r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
+ vshuff64x2 %%r4, %%t1, %%r4, 0x44 ; r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
+
+ vshuff64x2 %%r14, %%r7, %%r15, 0xEE ; r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
+ vshuff64x2 %%t1, %%r7, %%r15, 0x44 ; t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+ vshuff64x2 %%r15, %%r5, %%r13, 0xEE ; r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
+ vshuff64x2 %%r7, %%r5, %%r13, 0x44 ; r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r13, %%t0, %%r10, 0xEE ; r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
+ vshuff64x2 %%r5, %%t0, %%r10, 0x44 ; r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
+
+ vshuff64x2 %%r10, %%r3, %%r11, 0xEE ; r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
+ vshuff64x2 %%t0, %%r3, %%r11, 0x44 ; t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+
+ vshuff64x2 %%r11, %%r1, %%r9, 0xEE ; r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
+ vshuff64x2 %%r3, %%r1, %%r9, 0x44 ; r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
+
+ vshuff64x2 %%r9, %%r2, %%r6, 0xEE ; r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
+ vshuff64x2 %%r1, %%r2, %%r6, 0x44 ; r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
+
+ vmovdqa32 %%r2, %%t0 ; r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
+ vmovdqa32 %%r6, %%t1 ; r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
+
+%endmacro
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ H
+%xdefine H G
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+;; CH(E, F, G) = (E&F) ^ (~E&G)
+;; MAJ(A, B, C) = (A&B) ^ (A&C) ^ (B&C)
+;; SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22
+;; SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25
+;; sigma0 = ROR_7 ^ ROR_18 ^ SHR_3
+;; sigma1 = ROR_17 ^ ROR_19 ^ SHR_10
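+;;
+;; The vpternlogd immediates used below are the 3-input truth tables of these
+;; functions: 0xCA computes dst?src2:src3 (= CH), 0xE8 is the bitwise
+;; majority (= MAJ) and 0x96 is a three-way XOR, used to fold the three
+;; rotations of each SIGMA/sigma together.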
+
+; Main processing loop per round
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%ROUND %2
+ ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
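+	;; TMP3 already holds the round constant Kt here: the first K is loaded
+	;; at the top of the block loop (lloop) and each pass of this macro
+	;; pre-loads the next Kt at the bottom.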
+ vpaddd T1, H, TMP3 ; T1 = H + Kt
+ vmovdqa32 TMP0, E
+ vprord TMP1, E, 6 ; ROR_6(E)
+ vprord TMP2, E, 11 ; ROR_11(E)
+ vprord TMP3, E, 25 ; ROR_25(E)
+ vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddd T1, T1, %%WT ; T1 = T1 + Wt
+ vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E)
+ vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E)
+ vpaddd D, D, T1 ; D = D + T1
+
+ vprord H, A, 2 ; ROR_2(A)
+ vprord TMP2, A, 13 ; ROR_13(A)
+ vprord TMP3, A, 22 ; ROR_22(A)
+ vmovdqa32 TMP0, A
+ vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A)
+ vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+ vpaddd H, H, T1 ; H(A) = H(T2) + T1
+
+ vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
+
+; This is supposed to be SKL optimized assuming:
+; vpternlog, vpaddd ports 5,8
+; vprord ports 1,8
+; However, vprord is only working on port 8
+;
+; Main processing loop per round
+; Get the msg schedule word 16 rounds ahead from the current, now unnecessary, word
+%macro PROCESS_LOOP_00_47 5
+%define %%WT %1
+%define %%ROUND %2
+%define %%WTp1 %3
+%define %%WTp9 %4
+%define %%WTp14 %5
+ ;; T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = SIGMA0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ ;; For next value in msg schedule
+ ;; Wt+16 = sigma1(Wt+14) + Wt+9 + sigma0(Wt+1) + Wt
+
+ vmovdqa32 TMP0, E
+ vprord TMP1, E, 6 ; ROR_6(E)
+ vprord TMP2, E, 11 ; ROR_11(E)
+ vprord TMP3, E, 25 ; ROR_25(E)
+ vpternlogd TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddd T1, H, %%WT ; T1 = H + Wt
+ vpternlogd TMP1, TMP2, TMP3, 0x96 ; TMP1 = SIGMA1(E)
+ vpaddd T1, T1, TMP6 ; T1 = T1 + Kt
+ vprord H, A, 2 ; ROR_2(A)
+ vpaddd T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vprord TMP2, A, 13 ; ROR_13(A)
+ vmovdqa32 TMP0, A
+ vprord TMP3, A, 22 ; ROR_22(A)
+ vpaddd T1, T1, TMP1 ; T1 = T1 + SIGMA1(E)
+ vpternlogd TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpaddd D, D, T1 ; D = D + T1
+ vpternlogd H, TMP2, TMP3, 0x96 ; H(T2) = SIGMA0(A)
+ vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2)
+ vpaddd H, H, TMP0 ; H(T2) = SIGMA0(A) + MAJ(A,B,C)
+ vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2)
+ vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2)
+ vpaddd H, H, T1 ; H(A) = H(T2) + T1
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2)
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2)
+ vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15)
+ vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15)
+ vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+ vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15)
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) +
+						;      Wt-7 + sigma0(Wt-15)
+
+ vmovdqa32 TMP6, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_63 4
+%define %%WT %1
+%define %%WTp1 %2
+%define %%WTp9 %3
+%define %%WTp14 %4
+ vprord TMP4, %%WTp14, 17 ; ROR_17(Wt-2)
+ vprord TMP5, %%WTp14, 19 ; ROR_19(Wt-2)
+ vpsrld TMP6, %%WTp14, 10 ; SHR_10(Wt-2)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma1(Wt-2)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2)
+ vpaddd %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma1(Wt-2) + Wt-7
+
+ vprord TMP4, %%WTp1, 7 ; ROR_7(Wt-15)
+ vprord TMP5, %%WTp1, 18 ; ROR_18(Wt-15)
+ vpsrld TMP6, %%WTp1, 3 ; SHR_3(Wt-15)
+ vpternlogd TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma0(Wt-15)
+
+ vpaddd %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma1(Wt-2) +
+						;      Wt-7 + sigma0(Wt-15)
+%endmacro
+
+; Note this is reading in a block of data for one lane
+; When all 16 are read, the data must be transposed to build msg schedule
+%macro MSG_SCHED_ROUND_00_15 2
+%define %%WT %1
+%define %%OFFSET %2
+ mov inp0, [IN + (%%OFFSET*8)]
+ vmovups %%WT, [inp0+IDX]
+%endmacro
+
+align 64
+
+;; void sha256_mb_x16_avx512(SHA256_MB_ARGS_X16, uint32_t size)
+; arg 1 : pointer to input data
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+local_func_decl(sha256_mb_x16_avx512)
+sha256_mb_x16_avx512:
+ mov rax, rsp
+ sub rsp, STACK_SPACE
+ and rsp, ~63 ; align stack to multiple of 64
+ mov [rsp + _rsp], rax
+ lea TBL, [TABLE]
+
+ ;; Initialize digests
+ vmovups A, [DIGEST + 0*64]
+ vmovups B, [DIGEST + 1*64]
+ vmovups C, [DIGEST + 2*64]
+ vmovups D, [DIGEST + 3*64]
+ vmovups E, [DIGEST + 4*64]
+ vmovups F, [DIGEST + 5*64]
+ vmovups G, [DIGEST + 6*64]
+ vmovups H, [DIGEST + 7*64]
+
+	; No digest transpose is needed here: unlike SHA1, the SHA256
+	; args_digest is already stored in transposed (word-sliced) form
+
+ xor IDX, IDX
+
+ ;; Read in first block of input data
+ ;; Transpose input data
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+
+ vmovups W0,[inp0+IDX]
+ vmovups W1,[inp1+IDX]
+ vmovups W2,[inp2+IDX]
+ vmovups W3,[inp3+IDX]
+ vmovups W4,[inp4+IDX]
+ vmovups W5,[inp5+IDX]
+ vmovups W6,[inp6+IDX]
+ vmovups W7,[inp7+IDX]
+
+ mov inp0, [IN + 8*8]
+ mov inp1, [IN + 9*8]
+ mov inp2, [IN +10*8]
+ mov inp3, [IN +11*8]
+ mov inp4, [IN +12*8]
+ mov inp5, [IN +13*8]
+ mov inp6, [IN +14*8]
+ mov inp7, [IN +15*8]
+
+ vmovups W8, [inp0+IDX]
+ vmovups W9, [inp1+IDX]
+ vmovups W10,[inp2+IDX]
+ vmovups W11,[inp3+IDX]
+ vmovups W12,[inp4+IDX]
+ vmovups W13,[inp5+IDX]
+ vmovups W14,[inp6+IDX]
+ vmovups W15,[inp7+IDX]
+
+
+lloop:
+ vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ vmovdqa32 TMP3, [TBL] ; First K
+
+ ; Save digests for later addition
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ add IDX, 64
+
+ TRANSPOSE16 W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1
+
+%assign I 0
+%rep 16
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+
+ ; MSG Schedule for W0-W15 is now complete in registers
+ ; Process first 48 rounds
+	; W[t+16] is calculated once round t is processed and W[t] is no longer needed
+
+ ; PROCESS_LOOP_00_47 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep 48
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_63 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+
+	; Check if this is the last block
+ sub SIZE, 1
+ je lastLoop
+
+ ; Process last 16 rounds
+ ; Read in next block msg data for use in first 16 words of msg sched
+%assign I 48
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_00_15 APPEND(W,J), J
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7]
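+
+	; (This is the standard SHA-256 feed-forward: the new chaining value is
+	;  the saved input digest plus the output of the 64-round compression.)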
+
+ jmp lloop
+
+lastLoop:
+ ; Process last 16 rounds
+%assign I 48
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddd A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddd B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddd C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddd D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddd E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddd F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddd G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddd H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+	;; update input data pointers
+%assign I 0
+%rep 8
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ ; Write out digest
+	; No untranspose is needed; digests go back in the same transposed layout
+ vmovups [DIGEST + 0*64], A
+ vmovups [DIGEST + 1*64], B
+ vmovups [DIGEST + 2*64], C
+ vmovups [DIGEST + 3*64], D
+ vmovups [DIGEST + 4*64], E
+ vmovups [DIGEST + 5*64], F
+ vmovups [DIGEST + 6*64], G
+ vmovups [DIGEST + 7*64], H
+
+
+ mov rsp, [rsp + _rsp]
+ ret
+
+ section .data
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+PSHUFFLE_TRANSPOSE16_MASK1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+PSHUFFLE_TRANSPOSE16_MASK2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha256_mb_x16_avx512
+no_sha256_mb_x16_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm
new file mode 100644
index 000000000..5539d328f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_avx.asm
@@ -0,0 +1,428 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+default rel
+
+;; code to compute quad SHA256 using AVX
+;; Logic designed/laid out by JDG
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ vshufps %%r1, %%t0, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ vshufps %%r3, %%r0, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ vshufps %%r0, %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
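+
+; (vshufps picks its two low destination elements from the first source, as
+; selected by imm[1:0] and imm[3:2], and its two high elements from the second
+; source, as selected by imm[5:4] and imm[7:6]; hence 0x44/0xEE extract the
+; low/high element pairs and 0xDD/0x88 the odd/even elements.)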
+
+
+%define TABLE K256_4_MB
+%define SZ 4
+%define SZ4 4*SZ
+%define ROUNDS 64*SZ4
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, (32-(%%imm))
+ vpsrld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, (32-(%%imm))
+ vpsrld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
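+
+; AVX has no packed 32-bit rotate, so PRORD/PRORD_nd emulate ROR with a left
+; shift, a right shift and an OR.  The round macro below also merges rotates:
+; for example a0 = ROR5(e); a0 ^= e; ROR6(a0) gives ROR6(e) ^ ROR11(e) using
+; only two physical rotates.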
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+
+
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ4*(%%i&0xf) + rsp], %%T1
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
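+
+; For reference, ROUND_00_15 is one FIPS 180-4 SHA-256 round applied to all
+; four lanes at once:
+;   Ch(e,f,g)  = (e & f) ^ (~e & g)        computed here as ((f^g)&e)^g
+;   Maj(a,b,c) = (a&b) ^ (a&c) ^ (b&c)     computed here as ((a^c)&b)|(a&c)
+;   Sigma1(e)  = ROR6(e) ^ ROR11(e) ^ ROR25(e)
+;   Sigma0(a)  = ROR2(a) ^ ROR13(a) ^ ROR22(a)
+;   T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t]
+;   d += T1 ;  h = T1 + Sigma0(a) + Maj(a,b,c)
+; with ROTATE_ARGS renaming the registers instead of moving values.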
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+
+ vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp]
+ vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp]
+ vmovdqa a0, %%T1
+ PRORD %%T1, 18-7
+ vmovdqa a2, a1
+ PRORD a1, 19-17
+ vpxor %%T1, %%T1, a0
+ PRORD %%T1, 7
+ vpxor a1, a1, a2
+ PRORD a1, 17
+ vpsrld a0, a0, 3
+ vpxor %%T1, %%T1, a0
+ vpsrld a2, a2, 10
+ vpxor a1, a1, a2
+ vpaddd %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp]
+ vpaddd a1, a1, [SZ4*((%%i-7)&0xf) + rsp]
+ vpaddd %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
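+
+; ROUND_16_XX keeps the most recent 16 schedule words in a circular buffer on
+; the stack (indexed by %%i & 0xf) and computes
+;   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
+; before handing the new word to ROUND_00_15 for the actual round.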
+
+%define DIGEST_SIZE 8*SZ4
+%define DATA 16*SZ4
+%define ALIGNMENT 1*8
+; ALIGNMENT makes FRAMESZ + pushes an odd multiple of 8
+%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT)
+%define _DIGEST (DATA)
+
+%define VMOVPS vmovups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define IDX rax
+%define ROUND rbx
+%define TBL r12
+
+;; void sha256_mb_x4_avx(SHA256_MB_ARGS_X8 *args, uint64_t len);
+;; arg 1 : arg1 : pointer args (only 4 of the 8 lanes used)
+;; arg 2 : arg2 : size of data in blocks (assumed >= 1)
+;;
+;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15
+;;
+global sha256_mb_x4_avx:function internal
+align 32
+sha256_mb_x4_avx:
+ sub rsp, FRAMESZ
+
+ ;; Initialize digests
+ vmovdqa a,[arg1+0*SZ4]
+ vmovdqa b,[arg1+1*SZ4]
+ vmovdqa c,[arg1+2*SZ4]
+ vmovdqa d,[arg1+3*SZ4]
+ vmovdqa e,[arg1+4*SZ4]
+ vmovdqa f,[arg1+5*SZ4]
+ vmovdqa g,[arg1+6*SZ4]
+ vmovdqa h,[arg1+7*SZ4]
+
+ lea TBL,[TABLE]
+
+ ;; transpose input onto stack
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ4], a
+ vmovdqa [rsp + _DIGEST + 1*SZ4], b
+ vmovdqa [rsp + _DIGEST + 2*SZ4], c
+ vmovdqa [rsp + _DIGEST + 3*SZ4], d
+ vmovdqa [rsp + _DIGEST + 4*SZ4], e
+ vmovdqa [rsp + _DIGEST + 5*SZ4], f
+ vmovdqa [rsp + _DIGEST + 6*SZ4], g
+ vmovdqa [rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPS TT2,[inp0+IDX+i*16]
+ VMOVPS TT1,[inp1+IDX+i*16]
+ VMOVPS TT4,[inp2+IDX+i*16]
+ VMOVPS TT3,[inp3+IDX+i*16]
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+ vpshufb TT2, TT2, TMP
+ vpshufb TT3, TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4
+
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddd a, a, [rsp + _DIGEST + 0*SZ4]
+ vpaddd b, b, [rsp + _DIGEST + 1*SZ4]
+ vpaddd c, c, [rsp + _DIGEST + 2*SZ4]
+ vpaddd d, d, [rsp + _DIGEST + 3*SZ4]
+ vpaddd e, e, [rsp + _DIGEST + 4*SZ4]
+ vpaddd f, f, [rsp + _DIGEST + 5*SZ4]
+ vpaddd g, g, [rsp + _DIGEST + 6*SZ4]
+ vpaddd h, h, [rsp + _DIGEST + 7*SZ4]
+
+
+ sub arg2, 1
+ jne lloop
+
+ ; write digests out
+ vmovdqa [arg1+0*SZ4],a
+ vmovdqa [arg1+1*SZ4],b
+ vmovdqa [arg1+2*SZ4],c
+ vmovdqa [arg1+3*SZ4],d
+ vmovdqa [arg1+4*SZ4],e
+ vmovdqa [arg1+5*SZ4],f
+ vmovdqa [arg1+6*SZ4],g
+ vmovdqa [arg1+7*SZ4],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [arg1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [arg1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [arg1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+ ret
+
+section .data align=64
+
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm
new file mode 100644
index 000000000..e4fed5707
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x4_sse.asm
@@ -0,0 +1,423 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+default rel
+
+;; code to compute quad SHA256 using SSE
+;; Logic designed/laid out by JDG
+
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a3 a2 a1 a0}
+; r1 = {b3 b2 b1 b0}
+; r2 = {c3 c2 c1 c0}
+; r3 = {d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d0 c0 b0 a0}
+; r1 = {d1 c1 b1 a1}
+; r0 = {d2 c2 b2 a2}
+; r3 = {d3 c3 b3 a3}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ movaps %%t0, %%r0 ; t0 = {a3 a2 a1 a0}
+ shufps %%t0, %%r1, 0x44 ; t0 = {b1 b0 a1 a0}
+ shufps %%r0, %%r1, 0xEE ; r0 = {b3 b2 a3 a2}
+
+ movaps %%t1, %%r2 ; t1 = {c3 c2 c1 c0}
+ shufps %%t1, %%r3, 0x44 ; t1 = {d1 d0 c1 c0}
+ shufps %%r2, %%r3, 0xEE ; r2 = {d3 d2 c3 c2}
+
+ movaps %%r1, %%t0 ; r1 = {b1 b0 a1 a0}
+ shufps %%r1, %%t1, 0xDD ; r1 = {d1 c1 b1 a1}
+
+ movaps %%r3, %%r0 ; r3 = {b3 b2 a3 a2}
+ shufps %%r3, %%r2, 0xDD ; r3 = {d3 c3 b3 a3}
+
+ shufps %%r0, %%r2, 0x88 ; r0 = {d2 c2 b2 a2}
+ shufps %%t0, %%t1, 0x88 ; t0 = {d0 c0 b0 a0}
+%endmacro
+
+
+%define TABLE K256_4_MB
+%define SZ 4
+%define SZ4 4*SZ
+%define ROUNDS 64*SZ4
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psrld %%reg, %%imm
+ pslld %%tmp, (32-(%%imm))
+ por %%reg, %%tmp
+%endmacro
+
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+
+
+ movdqa a0, e ; sig1: a0 = e
+	movdqa	a1, e		; sig1: a1 = e
+ PRORD a0, (11-6) ; sig1: a0 = (e >> 5)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORD a1, 25 ; sig1: a1 = (e >> 25)
+ movdqa [SZ4*(%%i&0xf) + rsp],%%T1
+ paddd %%T1,[TBL + ROUND] ; T1 = W + K
+ pxor a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ paddd h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORD a2, (13-2) ; sig0: a2 = (a >> 11)
+ paddd h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORD a1, 22 ; sig0: a1 = (a >> 22)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddd h, a0
+
+ paddd d, h
+
+ pxor a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddd h, a1 ; h = h + ch + W + K + maj
+ paddd h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+
+ movdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp]
+ movdqa a1, [SZ4*((%%i-2)&0xf) + rsp]
+ movdqa a0, %%T1
+ PRORD %%T1, 18-7
+ movdqa a2, a1
+ PRORD a1, 19-17
+ pxor %%T1, a0
+ PRORD %%T1, 7
+ pxor a1, a2
+ PRORD a1, 17
+ psrld a0, 3
+ pxor %%T1, a0
+ psrld a2, 10
+ pxor a1, a2
+ paddd %%T1, [SZ4*((%%i-16)&0xf) + rsp]
+ paddd a1, [SZ4*((%%i-7)&0xf) + rsp]
+ paddd %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+%define DIGEST_SIZE 8*SZ4
+%define DATA 16*SZ4
+%define ALIGNMENT 1*8
+; ALIGNMENT makes FRAMESZ + pushes an odd multiple of 8
+%define FRAMESZ (DATA + DIGEST_SIZE + ALIGNMENT)
+%define _DIGEST (DATA)
+
+%define MOVPS movups
+
+%define inp0 r8
+%define inp1 r9
+%define inp2 r10
+%define inp3 r11
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define IDX rax
+%define ROUND rbx
+%define TBL r12
+
+;; void sha256_mb_x4_sse(SHA256_MB_ARGS_X8 *args, uint64_t len);
+;; arg 1 : pointer args (only 4 of the 8 lanes used)
+;; arg 2 : size of data in blocks (assumed >= 1)
+;;
+;; Clobbers registers: arg2, rax, rbx, r8-r12, xmm0-xmm15
+;;
+
+global sha256_mb_x4_sse:function internal
+align 32
+sha256_mb_x4_sse:
+ sub rsp, FRAMESZ
+
+ ;; Initialize digests
+ movdqa a,[arg1+0*SZ4]
+ movdqa b,[arg1+1*SZ4]
+ movdqa c,[arg1+2*SZ4]
+ movdqa d,[arg1+3*SZ4]
+ movdqa e,[arg1+4*SZ4]
+ movdqa f,[arg1+5*SZ4]
+ movdqa g,[arg1+6*SZ4]
+ movdqa h,[arg1+7*SZ4]
+
+ lea TBL,[TABLE]
+
+ ;; transpose input onto stack
+ mov inp0,[arg1 + _data_ptr + 0*8]
+ mov inp1,[arg1 + _data_ptr + 1*8]
+ mov inp2,[arg1 + _data_ptr + 2*8]
+ mov inp3,[arg1 + _data_ptr + 3*8]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ movdqa [rsp + _DIGEST + 0*SZ4], a
+ movdqa [rsp + _DIGEST + 1*SZ4], b
+ movdqa [rsp + _DIGEST + 2*SZ4], c
+ movdqa [rsp + _DIGEST + 3*SZ4], d
+ movdqa [rsp + _DIGEST + 4*SZ4], e
+ movdqa [rsp + _DIGEST + 5*SZ4], f
+ movdqa [rsp + _DIGEST + 6*SZ4], g
+ movdqa [rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+ movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ MOVPS TT2,[inp0+IDX+i*16]
+ MOVPS TT1,[inp1+IDX+i*16]
+ MOVPS TT4,[inp2+IDX+i*16]
+ MOVPS TT3,[inp3+IDX+i*16]
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ pshufb TT0, TMP
+ pshufb TT1, TMP
+ pshufb TT2, TMP
+ pshufb TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4
+
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ paddd a, [rsp + _DIGEST + 0*SZ4]
+ paddd b, [rsp + _DIGEST + 1*SZ4]
+ paddd c, [rsp + _DIGEST + 2*SZ4]
+ paddd d, [rsp + _DIGEST + 3*SZ4]
+ paddd e, [rsp + _DIGEST + 4*SZ4]
+ paddd f, [rsp + _DIGEST + 5*SZ4]
+ paddd g, [rsp + _DIGEST + 6*SZ4]
+ paddd h, [rsp + _DIGEST + 7*SZ4]
+
+
+ sub arg2, 1
+ jne lloop
+
+ ; write digests out
+ movdqa [arg1+0*SZ4],a
+ movdqa [arg1+1*SZ4],b
+ movdqa [arg1+2*SZ4],c
+ movdqa [arg1+3*SZ4],d
+ movdqa [arg1+4*SZ4],e
+ movdqa [arg1+5*SZ4],f
+ movdqa [arg1+6*SZ4],g
+ movdqa [arg1+7*SZ4],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [arg1 + _data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [arg1 + _data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [arg1 + _data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [arg1 + _data_ptr + 3*8], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, FRAMESZ
+ ret
+
+section .data align=64
+
+align 64
+TABLE:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm
new file mode 100644
index 000000000..f7d11ed99
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_mb_x8_avx2.asm
@@ -0,0 +1,639 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha256_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+default rel
+
+;; code to compute oct SHA256 using AVX2 (256-bit registers)
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Windows clobbers: rax rbx rdx rsi rdi r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rbp r8
+;;
+;; Linux clobbers: rax rbx rcx rdx rsi r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdi rbp r8
+;;
+;; clobbers ymm0-15
+
+%ifidn __OUTPUT_FORMAT__, elf64
+ ; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+ %define reg3 rcx
+ %define reg4 rdx
+%else
+ ; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+ %define reg3 rsi
+ %define reg4 rdi
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND rbx
+%define TBL reg3
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 reg4
+
+; ymm0 a
+; ymm1 b
+; ymm2 c
+; ymm3 d
+; ymm4 e
+; ymm5 f
+; ymm6 g TMP0
+; ymm7 h TMP1
+; ymm8 T1 TT0
+; ymm9 TT1
+; ymm10 TT2
+; ymm11 TT3
+; ymm12 a0 TT4
+; ymm13 a1 TT5
+; ymm14 a2 TT6
+; ymm15 TMP TT7
+
+%define a ymm0
+%define b ymm1
+%define c ymm2
+%define d ymm3
+%define e ymm4
+%define f ymm5
+%define g ymm6
+%define h ymm7
+
+%define T1 ymm8
+
+%define a0 ymm12
+%define a1 ymm13
+%define a2 ymm14
+%define TMP ymm15
+
+%define TMP0 ymm6
+%define TMP1 ymm7
+
+%define TT0 ymm8
+%define TT1 ymm9
+%define TT2 ymm10
+%define TT3 ymm11
+%define TT4 ymm12
+%define TT5 ymm13
+%define TT6 ymm14
+%define TT7 ymm15
+
+%define SZ8 8*SHA256_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 64*SZ8
+%define PTR_SZ 8
+%define SHA256_DIGEST_WORD_SIZE 4
+%define MAX_SHA256_LANES 8
+%define NUM_SHA256_DIGEST_WORDS 8
+%define SHA256_DIGEST_ROW_SIZE (MAX_SHA256_LANES * SHA256_DIGEST_WORD_SIZE)
+%define SHA256_DIGEST_SIZE (SHA256_DIGEST_ROW_SIZE * NUM_SHA256_DIGEST_WORDS)
+%define SHA256_BLK_SZ 64 ; in bytes
+
+; Define stack usage
+
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESZ mod 32 must be 32-8 = 24
+struc stack_frame
+ .data resb 16*SZ8
+ .digest resb 8*SZ8
+ .ytmp resb 4*SZ8
+ .rsp resb 8
+endstruc
+%define FRAMESZ stack_frame_size
+%define _DIGEST stack_frame.digest
+%define _YTMP stack_frame.ytmp
+%define _RSP_SAVE stack_frame.rsp
+
+%define YTMP0 rsp + _YTMP + 0*SZ8
+%define YTMP1 rsp + _YTMP + 1*SZ8
+%define YTMP2 rsp + _YTMP + 2*SZ8
+%define YTMP3 rsp + _YTMP + 3*SZ8
+
+%define VMOVPS vmovups
+
+; TRANSPOSE8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+; "transpose" data in {r0...r7} using temps {t0...t1}
+; Input looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+;
+; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+;
+%macro TRANSPOSE8 10
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+ ; process top half (r0..r3) {a...d}
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+ vshufps %%r3, %%t0, %%t1, 0xDD ; r3 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vshufps %%r1, %%r0, %%r2, 0x88 ; r1 = {d6 c6 b6 a6 d2 c2 b2 a2}
+ vshufps %%r0, %%r0, %%r2, 0xDD ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+ vshufps %%t0, %%t0, %%t1, 0x88 ; t0 = {d4 c4 b4 a4 d0 c0 b0 a0}
+
+ ; use r2 in place of t0
+ ; process bottom half (r4..r7) {e...h}
+ vshufps %%r2, %%r4, %%r5, 0x44 ; r2 = {f5 f4 e5 e4 f1 f0 e1 e0}
+ vshufps %%r4, %%r4, %%r5, 0xEE ; r4 = {f7 f6 e7 e6 f3 f2 e3 e2}
+ vshufps %%t1, %%r6, %%r7, 0x44 ; t1 = {h5 h4 g5 g4 h1 h0 g1 g0}
+ vshufps %%r6, %%r6, %%r7, 0xEE ; r6 = {h7 h6 g7 g6 h3 h2 g3 g2}
+ vshufps %%r7, %%r2, %%t1, 0xDD ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vshufps %%r5, %%r4, %%r6, 0x88 ; r5 = {h6 g6 f6 e6 h2 g2 f2 e2}
+ vshufps %%r4, %%r4, %%r6, 0xDD ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+ vshufps %%t1, %%r2, %%t1, 0x88 ; t1 = {h4 g4 f4 e4 h0 g0 f0 e0}
+
+ vperm2f128 %%r6, %%r5, %%r1, 0x13 ; h6...a6
+ vperm2f128 %%r2, %%r5, %%r1, 0x02 ; h2...a2
+ vperm2f128 %%r5, %%r7, %%r3, 0x13 ; h5...a5
+ vperm2f128 %%r1, %%r7, %%r3, 0x02 ; h1...a1
+ vperm2f128 %%r7, %%r4, %%r0, 0x13 ; h7...a7
+ vperm2f128 %%r3, %%r4, %%r0, 0x02 ; h3...a3
+ vperm2f128 %%r4, %%t1, %%t0, 0x13 ; h4...a4
+ vperm2f128 %%r0, %%t1, %%t0, 0x02 ; h0...a0
+%endmacro
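+
+; The 8x8 transpose is done in two stages: the vshufps sequence performs 4x4
+; transposes within each 128-bit half of the ymm registers, then the
+; vperm2f128 block recombines those halves across registers so each output
+; register holds one transposed row.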
+
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORD reg, imm, tmp
+%macro PRORD 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpslld %%tmp, %%reg, (32-(%%imm))
+ vpsrld %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORD_nd reg, imm, tmp, src
+%macro PRORD_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpslld %%tmp, %%src, (32-(%%imm))
+ vpsrld %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORD dst/src, amt
+%macro PRORD 2
+ PRORD %1, %2, TMP
+%endmacro
+
+; PRORD_nd dst, src, amt
+%macro PRORD_nd 3
+ PRORD_nd %1, %3, TMP, %2
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORD_nd a0, e, (11-6) ; sig1: a0 = (e >> 5)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORD_nd a1, e, 25 ; sig1: a1 = (e >> 25)
+ vmovdqa [SZ8*(%%i&0xf) + rsp], %%T1
+ vpaddd %%T1, %%T1, [TBL + ROUND] ; T1 = W + K
+ vpxor a0, a0, e ; sig1: a0 = e ^ (e >> 5)
+ PRORD a0, 6 ; sig1: a0 = (e >> 6) ^ (e >> 11)
+ vpaddd h, h, a2 ; h = h + ch
+ PRORD_nd a2, a, (13-2) ; sig0: a2 = (a >> 11)
+ vpaddd h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ PRORD_nd a1, a, 22 ; sig0: a1 = (a >> 22)
+ vpxor %%T1, a, c ; maj: T1 = a^c
+ add ROUND, SZ8 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddd h, h, a0
+
+ vpaddd d, d, h
+
+ vpxor a2, a2, a ; sig0: a2 = a ^ (a >> 11)
+ PRORD a2, 2 ; sig0: a2 = (a >> 2) ^ (a >> 13)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddd h, h, a1 ; h = h + ch + W + K + maj
+ vpaddd h, h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ8*((%%i-15)&0xf) + rsp]
+ vmovdqa a1, [SZ8*((%%i-2)&0xf) + rsp]
+ vmovdqa a0, %%T1
+ PRORD %%T1, 18-7
+ vmovdqa a2, a1
+ PRORD a1, 19-17
+ vpxor %%T1, %%T1, a0
+ PRORD %%T1, 7
+ vpxor a1, a1, a2
+ PRORD a1, 17
+ vpsrld a0, a0, 3
+ vpxor %%T1, %%T1, a0
+ vpsrld a2, a2, 10
+ vpxor a1, a1, a2
+ vpaddd %%T1, %%T1, [SZ8*((%%i-16)&0xf) + rsp]
+ vpaddd a1, a1, [SZ8*((%%i-7)&0xf) + rsp]
+ vpaddd %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+
+%endm
+
+
+;; void sha256_x8_avx2(SHA256_ARGS *args, uint64_t bytes);
+;; arg 1 : STATE : pointer to input data
+;; arg 2 : INP_SIZE : size of input in blocks
+global sha256_mb_x8_avx2:function internal
+align 16
+sha256_mb_x8_avx2:
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+
+ ; save rsp, allocate 32-byte aligned for local variables
+ mov IDX, rsp
+ sub rsp, FRAMESZ
+ and rsp, ~31
+ mov [rsp + _RSP_SAVE], IDX
+
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqu a,[STATE + 0*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu b,[STATE + 1*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu c,[STATE + 2*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu d,[STATE + 3*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu e,[STATE + 4*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu f,[STATE + 5*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu g,[STATE + 6*SHA256_DIGEST_ROW_SIZE]
+ vmovdqu h,[STATE + 7*SHA256_DIGEST_ROW_SIZE]
+
+ lea TBL,[K256_8_MB]
+
+	;; load the address of each of the 8 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _args_data_ptr + 0*PTR_SZ]
+ mov inp1,[STATE + _args_data_ptr + 1*PTR_SZ]
+ mov inp2,[STATE + _args_data_ptr + 2*PTR_SZ]
+ mov inp3,[STATE + _args_data_ptr + 3*PTR_SZ]
+ mov inp4,[STATE + _args_data_ptr + 4*PTR_SZ]
+ mov inp5,[STATE + _args_data_ptr + 5*PTR_SZ]
+ mov inp6,[STATE + _args_data_ptr + 6*PTR_SZ]
+ mov inp7,[STATE + _args_data_ptr + 7*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ8], a
+ vmovdqa [rsp + _DIGEST + 1*SZ8], b
+ vmovdqa [rsp + _DIGEST + 2*SZ8], c
+ vmovdqa [rsp + _DIGEST + 3*SZ8], d
+ vmovdqa [rsp + _DIGEST + 4*SZ8], e
+ vmovdqa [rsp + _DIGEST + 5*SZ8], f
+ vmovdqa [rsp + _DIGEST + 6*SZ8], g
+ vmovdqa [rsp + _DIGEST + 7*SZ8], h
+%assign i 0
+%rep 2
+ VMOVPS TT0,[inp0+IDX+i*32]
+ VMOVPS TT1,[inp1+IDX+i*32]
+ VMOVPS TT2,[inp2+IDX+i*32]
+ VMOVPS TT3,[inp3+IDX+i*32]
+ VMOVPS TT4,[inp4+IDX+i*32]
+ VMOVPS TT5,[inp5+IDX+i*32]
+ VMOVPS TT6,[inp6+IDX+i*32]
+ VMOVPS TT7,[inp7+IDX+i*32]
+ vmovdqa [YTMP0], g
+ vmovdqa [YTMP1], h
+ TRANSPOSE8 TT0, TT1, TT2, TT3, TT4, TT5, TT6, TT7, TMP0, TMP1
+ vmovdqa TMP1, [PSHUFFLE_BYTE_FLIP_MASK]
+ vmovdqa g, [YTMP0]
+ vpshufb TT0, TT0, TMP1
+ vpshufb TT1, TT1, TMP1
+ vpshufb TT2, TT2, TMP1
+ vpshufb TT3, TT3, TMP1
+ vpshufb TT4, TT4, TMP1
+ vpshufb TT5, TT5, TMP1
+ vpshufb TT6, TT6, TMP1
+ vpshufb TT7, TT7, TMP1
+ vmovdqa h, [YTMP1]
+ vmovdqa [YTMP0], TT4
+ vmovdqa [YTMP1], TT5
+ vmovdqa [YTMP2], TT6
+ vmovdqa [YTMP3], TT7
+ ROUND_00_15 TT0,(i*8+0)
+ vmovdqa TT0, [YTMP0]
+ ROUND_00_15 TT1,(i*8+1)
+ vmovdqa TT1, [YTMP1]
+ ROUND_00_15 TT2,(i*8+2)
+ vmovdqa TT2, [YTMP2]
+ ROUND_00_15 TT3,(i*8+3)
+ vmovdqa TT3, [YTMP3]
+ ROUND_00_15 TT0,(i*8+4)
+ ROUND_00_15 TT1,(i*8+5)
+ ROUND_00_15 TT2,(i*8+6)
+ ROUND_00_15 TT3,(i*8+7)
+%assign i (i+1)
+%endrep
+ add IDX, 4*4*4
+
+%assign i (i*8)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddd a, a, [rsp + _DIGEST + 0*SZ8]
+ vpaddd b, b, [rsp + _DIGEST + 1*SZ8]
+ vpaddd c, c, [rsp + _DIGEST + 2*SZ8]
+ vpaddd d, d, [rsp + _DIGEST + 3*SZ8]
+ vpaddd e, e, [rsp + _DIGEST + 4*SZ8]
+ vpaddd f, f, [rsp + _DIGEST + 5*SZ8]
+ vpaddd g, g, [rsp + _DIGEST + 6*SZ8]
+ vpaddd h, h, [rsp + _DIGEST + 7*SZ8]
+
+ sub INP_SIZE, 1 ;; unit is blocks
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqu [STATE + 0*SHA256_DIGEST_ROW_SIZE],a
+ vmovdqu [STATE + 1*SHA256_DIGEST_ROW_SIZE],b
+ vmovdqu [STATE + 2*SHA256_DIGEST_ROW_SIZE],c
+ vmovdqu [STATE + 3*SHA256_DIGEST_ROW_SIZE],d
+ vmovdqu [STATE + 4*SHA256_DIGEST_ROW_SIZE],e
+ vmovdqu [STATE + 5*SHA256_DIGEST_ROW_SIZE],f
+ vmovdqu [STATE + 6*SHA256_DIGEST_ROW_SIZE],g
+ vmovdqu [STATE + 7*SHA256_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _args_data_ptr + 0*8], inp0
+ add inp1, IDX
+ mov [STATE + _args_data_ptr + 1*8], inp1
+ add inp2, IDX
+ mov [STATE + _args_data_ptr + 2*8], inp2
+ add inp3, IDX
+ mov [STATE + _args_data_ptr + 3*8], inp3
+ add inp4, IDX
+ mov [STATE + _args_data_ptr + 4*8], inp4
+ add inp5, IDX
+ mov [STATE + _args_data_ptr + 5*8], inp5
+ add inp6, IDX
+ mov [STATE + _args_data_ptr + 6*8], inp6
+ add inp7, IDX
+ mov [STATE + _args_data_ptr + 7*8], inp7
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+ mov rsp, [rsp + _RSP_SAVE]
+ ret
+
+section .data
+align 64
+K256_8_MB:
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x428a2f98428a2f98, 0x428a2f98428a2f98
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0x7137449171374491, 0x7137449171374491
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xb5c0fbcfb5c0fbcf, 0xb5c0fbcfb5c0fbcf
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0xe9b5dba5e9b5dba5, 0xe9b5dba5e9b5dba5
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x3956c25b3956c25b, 0x3956c25b3956c25b
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x59f111f159f111f1, 0x59f111f159f111f1
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0x923f82a4923f82a4, 0x923f82a4923f82a4
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xab1c5ed5ab1c5ed5, 0xab1c5ed5ab1c5ed5
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0xd807aa98d807aa98, 0xd807aa98d807aa98
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x12835b0112835b01, 0x12835b0112835b01
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x243185be243185be, 0x243185be243185be
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x550c7dc3550c7dc3, 0x550c7dc3550c7dc3
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x72be5d7472be5d74, 0x72be5d7472be5d74
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x80deb1fe80deb1fe, 0x80deb1fe80deb1fe
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0x9bdc06a79bdc06a7, 0x9bdc06a79bdc06a7
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xc19bf174c19bf174, 0xc19bf174c19bf174
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xe49b69c1e49b69c1, 0xe49b69c1e49b69c1
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0xefbe4786efbe4786, 0xefbe4786efbe4786
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x0fc19dc60fc19dc6, 0x0fc19dc60fc19dc6
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x240ca1cc240ca1cc, 0x240ca1cc240ca1cc
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x2de92c6f2de92c6f, 0x2de92c6f2de92c6f
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x4a7484aa4a7484aa, 0x4a7484aa4a7484aa
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x5cb0a9dc5cb0a9dc, 0x5cb0a9dc5cb0a9dc
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x76f988da76f988da, 0x76f988da76f988da
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0x983e5152983e5152, 0x983e5152983e5152
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xa831c66da831c66d, 0xa831c66da831c66d
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xb00327c8b00327c8, 0xb00327c8b00327c8
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xbf597fc7bf597fc7, 0xbf597fc7bf597fc7
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xc6e00bf3c6e00bf3, 0xc6e00bf3c6e00bf3
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0xd5a79147d5a79147, 0xd5a79147d5a79147
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x06ca635106ca6351, 0x06ca635106ca6351
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x1429296714292967, 0x1429296714292967
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x27b70a8527b70a85, 0x27b70a8527b70a85
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x2e1b21382e1b2138, 0x2e1b21382e1b2138
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x4d2c6dfc4d2c6dfc, 0x4d2c6dfc4d2c6dfc
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x53380d1353380d13, 0x53380d1353380d13
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x650a7354650a7354, 0x650a7354650a7354
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x766a0abb766a0abb, 0x766a0abb766a0abb
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x81c2c92e81c2c92e, 0x81c2c92e81c2c92e
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0x92722c8592722c85, 0x92722c8592722c85
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa2bfe8a1a2bfe8a1, 0xa2bfe8a1a2bfe8a1
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xa81a664ba81a664b, 0xa81a664ba81a664b
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc24b8b70c24b8b70, 0xc24b8b70c24b8b70
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xc76c51a3c76c51a3, 0xc76c51a3c76c51a3
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd192e819d192e819, 0xd192e819d192e819
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xd6990624d6990624, 0xd6990624d6990624
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0xf40e3585f40e3585, 0xf40e3585f40e3585
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x106aa070106aa070, 0x106aa070106aa070
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x19a4c11619a4c116, 0x19a4c11619a4c116
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x1e376c081e376c08, 0x1e376c081e376c08
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x2748774c2748774c, 0x2748774c2748774c
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x34b0bcb534b0bcb5, 0x34b0bcb534b0bcb5
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x391c0cb3391c0cb3, 0x391c0cb3391c0cb3
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x4ed8aa4a4ed8aa4a, 0x4ed8aa4a4ed8aa4a
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x5b9cca4f5b9cca4f, 0x5b9cca4f5b9cca4f
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x682e6ff3682e6ff3, 0x682e6ff3682e6ff3
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x748f82ee748f82ee, 0x748f82ee748f82ee
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x78a5636f78a5636f, 0x78a5636f78a5636f
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x84c8781484c87814, 0x84c8781484c87814
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x8cc702088cc70208, 0x8cc702088cc70208
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0x90befffa90befffa, 0x90befffa90befffa
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xa4506ceba4506ceb, 0xa4506ceba4506ceb
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xbef9a3f7bef9a3f7, 0xbef9a3f7bef9a3f7
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+ dq 0xc67178f2c67178f2, 0xc67178f2c67178f2
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+ dq 0x0405060700010203, 0x0c0d0e0f08090a0b
+
+align 64
+global K256_MB
+K256_MB:
+ dd 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ dd 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ dd 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ dd 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ dd 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ dd 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ dd 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ dd 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ dd 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ dd 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ dd 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ dd 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ dd 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ dd 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ dd 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ dd 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm
new file mode 100644
index 000000000..612fe8656
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_multibinary.asm
@@ -0,0 +1,84 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+default rel
+[bits 64]
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern sha256_ctx_mgr_init_sse
+extern sha256_ctx_mgr_submit_sse
+extern sha256_ctx_mgr_flush_sse
+
+extern sha256_ctx_mgr_init_avx
+extern sha256_ctx_mgr_submit_avx
+extern sha256_ctx_mgr_flush_avx
+
+extern sha256_ctx_mgr_init_avx2
+extern sha256_ctx_mgr_submit_avx2
+extern sha256_ctx_mgr_flush_avx2
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern sha256_ctx_mgr_init_avx512
+ extern sha256_ctx_mgr_submit_avx512
+ extern sha256_ctx_mgr_flush_avx512
+%endif
+
+;;; *_mbinit are the initial values for *_dispatched; *_dispatched is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+; Initialise symbols
+mbin_interface sha256_ctx_mgr_init
+mbin_interface sha256_ctx_mgr_submit
+mbin_interface sha256_ctx_mgr_flush
+
+%ifdef HAVE_AS_KNOWS_AVX512
+	; Reuse mbin_dispatch_init6 by replacing the base version with the sse version
+ mbin_dispatch_init6 sha256_ctx_mgr_init, sha256_ctx_mgr_init_sse, sha256_ctx_mgr_init_sse, sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2, sha256_ctx_mgr_init_avx512
+ mbin_dispatch_init6 sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_sse, sha256_ctx_mgr_submit_sse, sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2, sha256_ctx_mgr_submit_avx512
+ mbin_dispatch_init6 sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_sse, sha256_ctx_mgr_flush_sse, sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2, sha256_ctx_mgr_flush_avx512
+%else
+ mbin_dispatch_init sha256_ctx_mgr_init, sha256_ctx_mgr_init_sse, sha256_ctx_mgr_init_avx, sha256_ctx_mgr_init_avx2
+ mbin_dispatch_init sha256_ctx_mgr_submit, sha256_ctx_mgr_submit_sse, sha256_ctx_mgr_submit_avx, sha256_ctx_mgr_submit_avx2
+ mbin_dispatch_init sha256_ctx_mgr_flush, sha256_ctx_mgr_flush_sse, sha256_ctx_mgr_flush_avx, sha256_ctx_mgr_flush_avx2
+%endif
+
+
+;;; func core, ver, snum
+slversion sha256_ctx_mgr_init, 00, 03, 0160
+slversion sha256_ctx_mgr_submit, 00, 03, 0161
+slversion sha256_ctx_mgr_flush, 00, 03, 0162
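The mbin_interface / mbin_dispatch_init macros above create a public symbol whose target is resolved once, on the first call, by probing CPU features and caching the chosen _sse/_avx/_avx2/_avx512 entry point. As a rough illustration of that pattern (a hedged C sketch; every name below is a hypothetical stand-in, not a symbol from this patch):

    /* Illustrative sketch only (not the ISA-L macros themselves): how the
     * mbin_interface / mbin_dispatch_init pattern behaves. All names here are
     * hypothetical stand-ins; the real dispatch is written in NASM in
     * multibinary.asm and selects among the _sse/_avx/_avx2/_avx512 symbols. */
    #include <stdio.h>

    typedef void (*init_fn)(void);

    static void init_sse(void)  { puts("init_sse");  }   /* stand-in for an L3 _sse entry  */
    static void init_avx2(void) { puts("init_avx2"); }   /* stand-in for an L3 _avx2 entry */

    static int cpu_has_avx2(void) { return 1; }          /* stand-in for the CPUID probe   */

    static void init_dispatch(void);
    static init_fn init_dispatched = init_dispatch;      /* "mbinit" value: the dispatcher */

    static void init_dispatch(void)
    {
            /* Executed on the first call only: pick the best version, cache it, call it. */
            init_dispatched = cpu_has_avx2() ? init_avx2 : init_sse;
            init_dispatched();
    }

    void sha256_ctx_mgr_init_demo(void)                  /* version-agnostic entry point   */
    {
            init_dispatched();
    }

    int main(void)
    {
            sha256_ctx_mgr_init_demo();   /* dispatches, then runs the chosen version */
            sha256_ctx_mgr_init_demo();   /* now calls the cached version directly    */
            return 0;
    }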
diff --git a/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c
new file mode 100644
index 000000000..2b23c3471
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha256_mb/sha256_ref.c
@@ -0,0 +1,212 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha256_mb.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA256 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#define H0 0x6a09e667
+#define H1 0xbb67ae85
+#define H2 0x3c6ef372
+#define H3 0xa54ff53a
+#define H4 0x510e527f
+#define H5 0x9b05688c
+#define H6 0x1f83d9ab
+#define H7 0x5be0cd19
+
+#define ror32(x, r) (((x)>>(r)) ^ ((x)<<(32-(r))))
+#define bswap(x) (((x)<<24) | (((x)&0xff00)<<8) | (((x)&0xff0000)>>8) | ((x)>>24))
+
+#define W(x) w[(x) & 15]
+
+#define S0(w) (ror32(w,7) ^ ror32(w,18) ^ (w >> 3))
+#define S1(w) (ror32(w,17) ^ ror32(w,19) ^ (w >> 10))
+
+#define s0(a) (ror32(a,2) ^ ror32(a,13) ^ ror32(a,22))
+#define s1(e) (ror32(e,6) ^ ror32(e,11) ^ ror32(e,25))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = bswap(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
+
+void sha256_single(const uint8_t * data, uint32_t digest[]);
+
+void sha256_ref(const uint8_t * input_data, uint32_t * digest, const uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA256_BLOCK_SIZE];
+ union {
+ uint64_t uint;
+ uint8_t uchar[8];
+ } convert;
+ uint8_t *p;
+
+ digest[0] = H0;
+ digest[1] = H1;
+ digest[2] = H2;
+ digest[3] = H3;
+ digest[4] = H4;
+ digest[5] = H5;
+ digest[6] = H6;
+ digest[7] = H7;
+
+ i = len;
+ while (i >= SHA256_BLOCK_SIZE) {
+ sha256_single(input_data, digest);
+ input_data += SHA256_BLOCK_SIZE;
+ i -= SHA256_BLOCK_SIZE;
+ }
+
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+ for (j = i; j < ((2 * SHA256_BLOCK_SIZE) - SHA256_PADLENGTHFIELD_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA256_BLOCK_SIZE - SHA256_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA256_BLOCK_SIZE;
+ else
+ i = SHA256_BLOCK_SIZE;
+
+ convert.uint = 8 * len;
+ p = buf + i - 8;
+ p[0] = convert.uchar[7];
+ p[1] = convert.uchar[6];
+ p[2] = convert.uchar[5];
+ p[3] = convert.uchar[4];
+ p[4] = convert.uchar[3];
+ p[5] = convert.uchar[2];
+ p[6] = convert.uchar[1];
+ p[7] = convert.uchar[0];
+
+ sha256_single(buf, digest);
+ if (i == 2 * SHA256_BLOCK_SIZE)
+ sha256_single(buf + SHA256_BLOCK_SIZE, digest);
+}
+
+void sha256_single(const uint8_t * data, uint32_t digest[])
+{
+ uint32_t a, b, c, d, e, f, g, h, t1, t2;
+ uint32_t w[16];
+ uint32_t *ww = (uint32_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98);
+ step(1, h, a, b, c, d, e, f, g, 0x71374491);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcf);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba5);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25b);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b01);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a7);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c1);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc6);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dc);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c8);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf3);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351);
+ step(31, b, c, d, e, f, g, h, a, 0x14292967);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a85);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b2138);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d13);
+ step(36, e, f, g, h, a, b, c, d, 0x650a7354);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c85);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a1);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664b);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a3);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819);
+ step(45, d, e, f, g, h, a, b, c, 0xd6990624);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e3585);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa070);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c08);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774c);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4a);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc70208);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506ceb);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2);
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
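A quick way to exercise the reference implementation above is to hash the standard FIPS 180 "abc" vector and compare the eight digest words. A minimal, self-contained sketch (it only assumes the sha256_ref() prototype as defined in this file):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    void sha256_ref(const uint8_t *input_data, uint32_t *digest, const uint32_t len);

    int main(void)
    {
            const uint8_t msg[] = "abc";
            uint32_t digest[8];
            const uint32_t expect[8] = {          /* SHA-256("abc") as 8 state words */
                    0xba7816bf, 0x8f01cfea, 0x414140de, 0x5dae2223,
                    0xb00361a3, 0x96177a9c, 0xb410ff61, 0xf20015ad
            };

            sha256_ref(msg, digest, 3);           /* hash the 3-byte message "abc" */

            for (int i = 0; i < 8; i++)
                    printf("%08x", digest[i]);
            printf("\n");

            return memcmp(digest, expect, sizeof(expect)) ? 1 : 0;
    }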
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am b/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am
new file mode 100644
index 000000000..6fc22d132
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/Makefile.am
@@ -0,0 +1,91 @@
+########################################################################
+# Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# * Neither the name of Intel Corporation nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+########################################################################
+
+lsrc += sha512_mb/sha512_ctx_sse.c \
+ sha512_mb/sha512_ctx_avx.c \
+ sha512_mb/sha512_ctx_avx2.c \
+ sha512_mb/sha512_ctx_sb_sse4.c
+
+lsrc += sha512_mb/sha512_mb_mgr_init_sse.c \
+ sha512_mb/sha512_mb_mgr_init_avx2.c \
+ sha512_mb/sha512_sb_mgr_init_sse4.c
+
+
+lsrc += sha512_mb/sha512_mb_mgr_submit_sse.asm \
+ sha512_mb/sha512_mb_mgr_submit_avx.asm \
+ sha512_mb/sha512_mb_mgr_submit_avx2.asm \
+ sha512_mb/sha512_mb_mgr_flush_sse.asm \
+ sha512_mb/sha512_mb_mgr_flush_avx.asm \
+ sha512_mb/sha512_mb_mgr_flush_avx2.asm \
+ sha512_mb/sha512_mb_x2_sse.asm \
+ sha512_mb/sha512_mb_x2_avx.asm \
+ sha512_mb/sha512_mb_x4_avx2.asm \
+ sha512_mb/sha512_multibinary.asm \
+ sha512_mb/sha512_sb_mgr_submit_sse4.c \
+ sha512_mb/sha512_sb_mgr_flush_sse4.c \
+ sha512_mb/sha512_sse4.asm
+
+lsrc += sha512_mb/sha512_ctx_avx512.c \
+ sha512_mb/sha512_mb_mgr_init_avx512.c \
+ sha512_mb/sha512_mb_mgr_submit_avx512.asm \
+ sha512_mb/sha512_mb_mgr_flush_avx512.asm \
+ sha512_mb/sha512_mb_x8_avx512.asm
+
+extern_hdrs += include/sha512_mb.h \
+ include/multi_buffer.h
+
+other_src += include/datastruct.asm \
+ sha512_mb/sha512_job.asm \
+ sha512_mb/sha512_mb_mgr_datastruct.asm \
+ include/reg_sizes.asm \
+ sha512_mb/sha512_ref.c \
+ include/memcpy_inline.h \
+ include/memcpy.asm \
+ include/intrinreg.h
+
+check_tests += sha512_mb/sha512_mb_test \
+ sha512_mb/sha512_mb_rand_test \
+ sha512_mb/sha512_mb_rand_update_test
+
+unit_tests += sha512_mb/sha512_mb_rand_ssl_test
+
+perf_tests += sha512_mb/sha512_mb_vs_ossl_perf
+
+sha512_mb_rand_test: sha512_ref.o
+sha512_mb_sha512_mb_rand_test_LDADD = sha512_mb/sha512_ref.lo libisal_crypto.la
+
+sha512_mb_rand_update_test: sha512_ref.o
+sha512_mb_sha512_mb_rand_update_test_LDADD = sha512_mb/sha512_ref.lo libisal_crypto.la
+
+sha512_mb_rand_ssl_test: LDLIBS += -lcrypto
+sha512_mb_sha512_mb_rand_ssl_test_LDFLAGS = -lcrypto
+
+sha512_mb_vs_ossl_perf: LDLIBS += -lcrypto
+sha512_mb_sha512_mb_vs_ossl_perf_LDFLAGS = -lcrypto
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c
new file mode 100644
index 000000000..4e5173155
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx.c
@@ -0,0 +1,254 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_avx(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_avx(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_avx_slver_02020166;
+struct slver sha512_ctx_mgr_init_avx_slver = { 0x0166, 0x02, 0x02 };
+
+struct slver sha512_ctx_mgr_submit_avx_slver_02020167;
+struct slver sha512_ctx_mgr_submit_avx_slver = { 0x0167, 0x02, 0x02 };
+
+struct slver sha512_ctx_mgr_flush_avx_slver_02020168;
+struct slver sha512_ctx_mgr_flush_avx_slver = { 0x0168, 0x02, 0x02 };
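The submit/flush pair above is the whole public surface of the context manager: callers pass HASH_FIRST/HASH_UPDATE/HASH_LAST (or HASH_ENTIRE for a one-shot hash), then flush until no jobs remain. A minimal usage sketch, assuming the declarations in sha512_mb.h and the hash_ctx_init() helper from multi_buffer.h; the posix_memalign allocation and the result_digest field name are assumptions, and error handling is omitted:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "sha512_mb.h"

    int main(void)
    {
            SHA512_HASH_CTX_MGR *mgr = NULL;
            SHA512_HASH_CTX ctxpool[2], *ctx;
            const char *msgs[2] = { "hello", "multi-buffer sha512" };

            if (posix_memalign((void **)&mgr, 16, sizeof(*mgr)))
                    return 1;
            sha512_ctx_mgr_init_avx(mgr);

            for (int i = 0; i < 2; i++) {
                    hash_ctx_init(&ctxpool[i]);        /* clear status/error before first use */
                    sha512_ctx_mgr_submit_avx(mgr, &ctxpool[i], msgs[i],
                                              (uint32_t) strlen(msgs[i]),
                                              HASH_ENTIRE);   /* FIRST and LAST in one call */
            }

            /* Lanes may still hold unfinished jobs; flush until none remain. */
            while ((ctx = sha512_ctx_mgr_flush_avx(mgr)) != NULL)
                    printf("digest word 0: %016llx\n",
                           (unsigned long long) ctx->job.result_digest[0]);

            free(mgr);
            return 0;
    }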
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c
new file mode 100644
index 000000000..d1b7d7270
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx2.c
@@ -0,0 +1,254 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_avx2(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_avx2(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx2(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx2(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx2(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx2(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_avx2_slver_04020169;
+struct slver sha512_ctx_mgr_init_avx2_slver = { 0x0169, 0x02, 0x04 };
+
+struct slver sha512_ctx_mgr_submit_avx2_slver_04020170;
+struct slver sha512_ctx_mgr_submit_avx2_slver = { 0x0170, 0x02, 0x04 };
+
+struct slver sha512_ctx_mgr_flush_avx2_slver_04020171;
+struct slver sha512_ctx_mgr_flush_avx2_slver = { 0x0171, 0x02, 0x04 };
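The rounding expression in hash_pad() above is compact enough to deserve a worked check: with SHA512_BLOCK_SIZE = 128 and SHA512_PADLENGTHFIELD_SIZE = 16 (values assumed from sha512_mb.h), a 100-byte tail leaves room for the 0x80 byte and the 16-byte length field in one extra block, while a 120-byte tail forces a second. A standalone sketch of just that arithmetic:

    /* Hedged arithmetic check (not part of the patch): the extra-block count
     * computed by hash_pad(), with block size 128 and a 16-byte length field. */
    #include <stdio.h>
    #include <stdint.h>

    static uint32_t pad_blocks(uint32_t total_len)
    {
            const uint32_t block = 128, lenfield = 16;
            uint32_t i = total_len & (block - 1);      /* bytes already in the last block */

            /* Round i up to the end of the 1st or 2nd extra block, leaving room
             * for the 0x80 byte and the 16-byte big-endian length field. */
            i += ((block - 1) & (0 - (total_len + lenfield + 1))) + 1 + lenfield;
            return i >> 7;                             /* number of extra blocks to hash */
    }

    int main(void)
    {
            printf("%u\n", pad_blocks(100));   /* 1: 0x80 and the length field fit      */
            printf("%u\n", pad_blocks(120));   /* 2: they spill into a second extra block */
            return 0;
    }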
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c
new file mode 100644
index 000000000..f99116eb1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_avx512.c
@@ -0,0 +1,259 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+#ifdef HAVE_AS_KNOWS_AVX512
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_avx512(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_avx512(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_avx512(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_avx512(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_avx512(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx =
+ (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_avx512(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_avx512_slver_0600016a;
+struct slver sha512_ctx_mgr_init_avx512_slver = { 0x016a, 0x00, 0x06 };
+
+struct slver sha512_ctx_mgr_submit_avx512_slver_0600016b;
+struct slver sha512_ctx_mgr_submit_avx512_slver = { 0x016b, 0x00, 0x06 };
+
+struct slver sha512_ctx_mgr_flush_avx512_slver_0600016c;
+struct slver sha512_ctx_mgr_flush_avx512_slver = { 0x016c, 0x00, 0x06 };
+
+#endif // HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c
new file mode 100644
index 000000000..6b44f075c
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sb_sse4.c
@@ -0,0 +1,254 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_sb_sse4(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_sb_mgr_init_sse4(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_sb_sse4(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx, const void *buffer,
+ uint32_t len, HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr,
+ &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_sb_sse4(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_flush_sse4(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_sb_mgr_submit_sse4(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_sb_sse4_slver_05020172;
+struct slver sha512_ctx_mgr_init_sb_sse4_slver = { 0x0172, 0x02, 0x05 };
+
+struct slver sha512_ctx_mgr_submit_sb_sse4_slver_05020173;
+struct slver sha512_ctx_mgr_submit_sb_sse4_slver = { 0x0173, 0x02, 0x05 };
+
+struct slver sha512_ctx_mgr_flush_sb_sse4_slver_05020174;
+struct slver sha512_ctx_mgr_flush_sb_sse4_slver = { 0x0174, 0x02, 0x05 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c
new file mode 100644
index 000000000..b4dfe5332
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ctx_sse.c
@@ -0,0 +1,254 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+#include "memcpy_inline.h"
+
+#ifdef _MSC_VER
+# include <intrin.h>
+# define inline __inline
+#endif
+
+static inline void hash_init_digest(SHA512_WORD_T * digest);
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len);
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx);
+
+void sha512_ctx_mgr_init_sse(SHA512_HASH_CTX_MGR * mgr)
+{
+ sha512_mb_mgr_init_sse(&mgr->mgr);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_submit_sse(SHA512_HASH_CTX_MGR * mgr, SHA512_HASH_CTX * ctx,
+ const void *buffer, uint32_t len,
+ HASH_CTX_FLAG flags)
+{
+ if (flags & (~HASH_ENTIRE)) {
+ // User should not pass anything other than FIRST, UPDATE, or LAST
+ ctx->error = HASH_CTX_ERROR_INVALID_FLAGS;
+ return ctx;
+ }
+
+ if (ctx->status & HASH_CTX_STS_PROCESSING) {
+ // Cannot submit to a currently processing job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_PROCESSING;
+ return ctx;
+ }
+
+ if ((ctx->status & HASH_CTX_STS_COMPLETE) && !(flags & HASH_FIRST)) {
+ // Cannot update a finished job.
+ ctx->error = HASH_CTX_ERROR_ALREADY_COMPLETED;
+ return ctx;
+ }
+
+ if (flags & HASH_FIRST) {
+ // Init digest
+ hash_init_digest(ctx->job.result_digest);
+
+ // Reset byte counter
+ ctx->total_length = 0;
+
+ // Clear extra blocks
+ ctx->partial_block_buffer_length = 0;
+ }
+ // If we made it here, there were no errors during this call to submit
+ ctx->error = HASH_CTX_ERROR_NONE;
+
+ // Store buffer ptr info from user
+ ctx->incoming_buffer = buffer;
+ ctx->incoming_buffer_length = len;
+
+ // Store the user's request flags and mark this ctx as currently being processed.
+ ctx->status = (flags & HASH_LAST) ?
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_LAST) :
+ HASH_CTX_STS_PROCESSING;
+
+ // Advance byte counter
+ ctx->total_length += len;
+
+ // If there is anything currently buffered in the extra blocks, append to it until it contains a whole block.
+ // Or if the user's buffer contains less than a whole block, append as much as possible to the extra block.
+ if ((ctx->partial_block_buffer_length) | (len < SHA512_BLOCK_SIZE)) {
+ // Compute how many bytes to copy from user buffer into extra block
+ uint32_t copy_len = SHA512_BLOCK_SIZE - ctx->partial_block_buffer_length;
+ if (len < copy_len)
+ copy_len = len;
+
+ if (copy_len) {
+ // Copy and update relevant pointers and counters
+ memcpy_varlen(&ctx->partial_block_buffer
+ [ctx->partial_block_buffer_length], buffer, copy_len);
+
+ ctx->partial_block_buffer_length += copy_len;
+ ctx->incoming_buffer = (const void *)((const char *)buffer + copy_len);
+ ctx->incoming_buffer_length = len - copy_len;
+ }
+ // The extra block should never contain more than 1 block here
+ assert(ctx->partial_block_buffer_length <= SHA512_BLOCK_SIZE);
+
+ // If the extra block buffer contains exactly 1 block, it can be hashed.
+ if (ctx->partial_block_buffer_length >= SHA512_BLOCK_SIZE) {
+ ctx->partial_block_buffer_length = 0;
+
+ ctx->job.buffer = ctx->partial_block_buffer;
+ ctx->job.len = 1;
+
+ ctx =
+ (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr, &ctx->job);
+ }
+ }
+
+ return sha512_ctx_mgr_resubmit(mgr, ctx);
+}
+
+SHA512_HASH_CTX *sha512_ctx_mgr_flush_sse(SHA512_HASH_CTX_MGR * mgr)
+{
+ SHA512_HASH_CTX *ctx;
+
+ while (1) {
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_flush_sse(&mgr->mgr);
+
+ // If flush returned 0, there are no more jobs in flight.
+ if (!ctx)
+ return NULL;
+
+ // If flush returned a job, verify that it is safe to return to the user.
+ // If it is not ready, resubmit the job to finish processing.
+ ctx = sha512_ctx_mgr_resubmit(mgr, ctx);
+
+ // If sha512_ctx_mgr_resubmit returned a job, it is ready to be returned.
+ if (ctx)
+ return ctx;
+
+ // Otherwise, all jobs currently being managed by the SHA512_HASH_CTX_MGR still need processing. Loop.
+ }
+}
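+
+/*
+ * A minimal usage sketch of this context-manager API (single job; error
+ * handling and aligned allocation omitted; hash_ctx_init() is assumed to
+ * come from the generic multi-buffer helpers):
+ *
+ *	SHA512_HASH_CTX_MGR mgr;
+ *	SHA512_HASH_CTX ctx;
+ *
+ *	sha512_ctx_mgr_init_sse(&mgr);
+ *	hash_ctx_init(&ctx);
+ *	sha512_ctx_mgr_submit_sse(&mgr, &ctx, buf, len, HASH_ENTIRE);
+ *	while (sha512_ctx_mgr_flush_sse(&mgr) != NULL)
+ *		;
+ *
+ * Once the flush loop drains, ctx.job.result_digest holds the digest words.
+ */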
+
+static SHA512_HASH_CTX *sha512_ctx_mgr_resubmit(SHA512_HASH_CTX_MGR * mgr,
+ SHA512_HASH_CTX * ctx)
+{
+ while (ctx) {
+ if (ctx->status & HASH_CTX_STS_COMPLETE) {
+ ctx->status = HASH_CTX_STS_COMPLETE; // Clear PROCESSING bit
+ return ctx;
+ }
+ // If the extra blocks are empty, begin hashing what remains in the user's buffer.
+ if (ctx->partial_block_buffer_length == 0 && ctx->incoming_buffer_length) {
+ const void *buffer = ctx->incoming_buffer;
+ uint32_t len = ctx->incoming_buffer_length;
+
+ // Only entire blocks can be hashed. Copy remainder to extra blocks buffer.
+ uint32_t copy_len = len & (SHA512_BLOCK_SIZE - 1);
+
+ if (copy_len) {
+ len -= copy_len;
+ memcpy_varlen(ctx->partial_block_buffer,
+ ((const char *)buffer + len), copy_len);
+ ctx->partial_block_buffer_length = copy_len;
+ }
+
+ ctx->incoming_buffer_length = 0;
+
+ // len should be a multiple of the block size now
+ assert((len % SHA512_BLOCK_SIZE) == 0);
+
+ // Set len to the number of blocks to be hashed in the user's buffer
+ len >>= SHA512_LOG2_BLOCK_SIZE;
+
+ if (len) {
+ ctx->job.buffer = (uint8_t *) buffer;
+ ctx->job.len = len;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+ }
+ // If the extra blocks are not empty, then we are either on the last block(s)
+ // or we need more user input before continuing.
+ if (ctx->status & HASH_CTX_STS_LAST) {
+ uint8_t *buf = ctx->partial_block_buffer;
+ uint32_t n_extra_blocks = hash_pad(buf, ctx->total_length);
+
+ ctx->status =
+ (HASH_CTX_STS) (HASH_CTX_STS_PROCESSING | HASH_CTX_STS_COMPLETE);
+ ctx->job.buffer = buf;
+ ctx->job.len = (uint32_t) n_extra_blocks;
+ ctx = (SHA512_HASH_CTX *) sha512_mb_mgr_submit_sse(&mgr->mgr,
+ &ctx->job);
+ continue;
+ }
+
+ if (ctx)
+ ctx->status = HASH_CTX_STS_IDLE;
+ return ctx;
+ }
+
+ return NULL;
+}
+
+static inline void hash_init_digest(SHA512_WORD_T * digest)
+{
+ static const SHA512_WORD_T hash_initial_digest[SHA512_DIGEST_NWORDS] =
+ { SHA512_INITIAL_DIGEST };
+ memcpy_fixedlen(digest, hash_initial_digest, sizeof(hash_initial_digest));
+}
+
+static inline uint32_t hash_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2], uint32_t total_len)
+{
+ uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1);
+
+ memclr_fixedlen(&padblock[i], SHA512_BLOCK_SIZE);
+ padblock[i] = 0x80;
+
+ // Move i to the end of either 1st or 2nd extra block depending on length
+ i += ((SHA512_BLOCK_SIZE - 1) & (0 - (total_len + SHA512_PADLENGTHFIELD_SIZE + 1))) +
+ 1 + SHA512_PADLENGTHFIELD_SIZE;
+
+#if SHA512_PADLENGTHFIELD_SIZE == 16
+ *((uint64_t *) & padblock[i - 16]) = 0;
+#endif
+
+ *((uint64_t *) & padblock[i - 8]) = _byteswap_uint64((uint64_t) total_len << 3);
+
+ return i >> SHA512_LOG2_BLOCK_SIZE; // Number of extra blocks to hash
+}
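+
+/*
+ * Worked example for the padding arithmetic above (assuming the usual
+ * SHA512_BLOCK_SIZE of 128 and SHA512_PADLENGTHFIELD_SIZE of 16):
+ *   total_len = 100: i starts at 100; 100 + 1 + 16 still fits in the first
+ *                    extra block, so i lands on 128 and the function returns 1.
+ *   total_len = 120: i starts at 120; the 0x80 byte plus the 16-byte length
+ *                    field spill into a second block, so i lands on 256 and
+ *                    the function returns 2.
+ */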
+
+struct slver {
+ uint16_t snum;
+ uint8_t ver;
+ uint8_t core;
+};
+struct slver sha512_ctx_mgr_init_sse_slver_00020163;
+struct slver sha512_ctx_mgr_init_sse_slver = { 0x0163, 0x02, 0x00 };
+
+struct slver sha512_ctx_mgr_submit_sse_slver_00020164;
+struct slver sha512_ctx_mgr_submit_sse_slver = { 0x0164, 0x02, 0x00 };
+
+struct slver sha512_ctx_mgr_flush_sse_slver_00020165;
+struct slver sha512_ctx_mgr_flush_sse_slver = { 0x0165, 0x02, 0x00 };
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm
new file mode 100644
index 000000000..7f2bdae48
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_job.asm
@@ -0,0 +1,54 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define constants
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%define STS_UNKNOWN 0
+%define STS_BEING_PROCESSED 1
+%define STS_COMPLETED 2
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA512_JOB structure
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; JOB_SHA512
+
+;;; name size align
+FIELD _buffer, 8, 8 ; pointer to buffer
+FIELD _len, 8, 8 ; length in bytes
+FIELD _result_digest, 8*8, 64 ; Digest (output)
+FIELD _status, 4, 4
+FIELD _user_data, 8, 8
+
+%assign _SHA512_JOB_size _FIELD_OFFSET
+%assign _SHA512_JOB_align _STRUCT_ALIGN
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm
new file mode 100644
index 000000000..d1578109e
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_datastruct.asm
@@ -0,0 +1,72 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "datastruct.asm"
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;; Define SHA512 Out Of Order Data Structures
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; LANE_DATA
+;;; name size align
+FIELD _job_in_lane, 8, 8 ; pointer to job object
+END_FIELDS
+
+%assign _LANE_DATA_size _FIELD_OFFSET
+%assign _LANE_DATA_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; SHA512_ARGS_X8
+;;; name size align
+FIELD _digest, 8*8*8, 4 ; transposed digest
+FIELD _data_ptr, 8*8, 8 ; array of pointers to data
+END_FIELDS
+
+%assign _SHA512_ARGS_X4_size _FIELD_OFFSET
+%assign _SHA512_ARGS_X4_align _STRUCT_ALIGN
+%assign _SHA512_ARGS_X8_size _FIELD_OFFSET
+%assign _SHA512_ARGS_X8_align _STRUCT_ALIGN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+START_FIELDS ; MB_MGR
+;;; name size align
+FIELD _args, _SHA512_ARGS_X4_size, _SHA512_ARGS_X4_align
+FIELD _lens, 8*8, 8
+FIELD _unused_lanes, 8, 8
+FIELD _ldata, _LANE_DATA_size*8, _LANE_DATA_align
+FIELD _num_lanes_inuse, 4, 4
+END_FIELDS
+
+%assign _MB_MGR_size _FIELD_OFFSET
+%assign _MB_MGR_align _STRUCT_ALIGN
+
+_args_digest equ _args + _digest
+_args_data_ptr equ _args + _data_ptr
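+
+; Reading aid (derived from how the submit/flush routines index these fields):
+; _args_digest is stored transposed, grouped by digest word rather than by
+; lane, so lane i's word w sits at _args_digest + 8*i + w*stride.  The
+; SSE/AVX/AVX2 managers use the 4-lane stride of 32 bytes, the AVX512 manager
+; the 8-lane stride of 64 bytes.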
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm
new file mode 100644
index 000000000..33c62773a
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx.asm
@@ -0,0 +1,218 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_avx
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_avx(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha512_mb_mgr_flush_avx:function
+sha512_mb_mgr_flush_avx:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
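+	; unused_lanes is a byte stack of free lane numbers (popped from the low
+	; byte) sitting on top of an 0xFF marker.  With the two-lane encoding set
+	; up at init (0xFF0100 when idle), bit 16+7 is the top bit of the marker
+	; byte and is set only when no job is in flight.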
+ bt unused_lanes, 16+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+
+ ; copy idx to empty lanes
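+	; Empty lanes are pointed at a valid lane's data and given a huge block
+	; count (0xFFFFFFFF), so the x2 kernel can run across both lanes while
+	; the empty one can never win the min-length selection below.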
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 2
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_avx
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm
new file mode 100644
index 000000000..61c25aaef
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx2.asm
@@ -0,0 +1,239 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x4_avx2
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.xmm
+%define _GPR_SAVE stack_frame.gpr
+%define STACK_SPACE stack_frame_size
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_avx2(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha512_mb_mgr_flush_avx2:function
+sha512_mb_mgr_flush_avx2:
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ vmovdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 32+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+ cmp qword [state + _ldata + 2 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [two]
+ cmp qword [state + _ldata + 3 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [three]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 4
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov lens2, [state + _lens + 2*8]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov lens3, [state + _lens + 3*8]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+ mov [state + _lens + 2*8], lens2
+ mov [state + _lens + 3*8], lens3
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x4_avx2
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm
new file mode 100644
index 000000000..c16517821
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_avx512.asm
@@ -0,0 +1,266 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern sha512_mb_x8_avx512
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define num_lanes_inuse r9
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define num_lanes_inuse r9
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*8
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.xmm
+%define _GPR_SAVE stack_frame.gpr
+%define STACK_SPACE stack_frame_size
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_avx512(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha512_mb_mgr_flush_avx512:function
+sha512_mb_mgr_flush_avx512:
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*3], rbp
+ mov [rsp + _GPR_SAVE + 8*4], r12
+ mov [rsp + _GPR_SAVE + 8*5], r13
+ mov [rsp + _GPR_SAVE + 8*6], r14
+ mov [rsp + _GPR_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*1], rsi
+ mov [rsp + _GPR_SAVE + 8*2], rdi
+ vmovdqu [rsp + _XMM_SAVE + 16*0], xmm6
+ vmovdqu [rsp + _XMM_SAVE + 16*1], xmm7
+ vmovdqu [rsp + _XMM_SAVE + 16*2], xmm8
+ vmovdqu [rsp + _XMM_SAVE + 16*3], xmm9
+ vmovdqu [rsp + _XMM_SAVE + 16*4], xmm10
+ vmovdqu [rsp + _XMM_SAVE + 16*5], xmm11
+ vmovdqu [rsp + _XMM_SAVE + 16*6], xmm12
+ vmovdqu [rsp + _XMM_SAVE + 16*7], xmm13
+ vmovdqu [rsp + _XMM_SAVE + 16*8], xmm14
+ vmovdqu [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ cmp num_lanes_inuse, 0
+ jz return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+%assign I 1
+%rep 7
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [APPEND(lane_,I)]
+%assign I (I+1)
+%endrep
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 8
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length, len in sha512_mgr is 64bit, high 32bit is block num, low 8bit is idx
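+	; e.g. a job with 5 blocks left in lane 2 is encoded as 0x0000000500000002,
+	; so the unsigned min over the lens array yields the shortest job with its
+	; lane index already in the low bits.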
+ vmovdqu ymm0, [state + _lens + 0*32] ; ymm0 has {D,d,C,c,B,b,A,a}
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminuq ymm2, ymm0, ymm1 ; ymm2 has {D,i,C,i,B,i,A,i}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,i,D,i,x,i,B,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has {x,i,F,i,x,i,E,i}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,i,x,i,x,i,F,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has min value in high dword
+
+ vmovq idx, xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 32 ; SHA512 blocksize is 1024bit
+ jz len_is_0
+
+ vperm2i128 ymm2, ymm2, ymm2, 0 ; ymm2 has {x,x,E,i,x,x,E,i}
+ vpand ymm2, ymm2, [rel clear_low_nibble] ; ymm2 has {0,0,E,0,0,0,E,0}
+ vpshufd ymm2, ymm2, 0x44 ; ymm2 has {E,0,E,0,E,0,E,0}
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x8_avx512
+ ; state and idx are intact
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*64]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*64], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*64]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*64], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*64]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*64], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*64]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*64], 1
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + _XMM_SAVE + 16*0]
+ vmovdqu xmm7, [rsp + _XMM_SAVE + 16*1]
+ vmovdqu xmm8, [rsp + _XMM_SAVE + 16*2]
+ vmovdqu xmm9, [rsp + _XMM_SAVE + 16*3]
+ vmovdqu xmm10, [rsp + _XMM_SAVE + 16*4]
+ vmovdqu xmm11, [rsp + _XMM_SAVE + 16*5]
+ vmovdqu xmm12, [rsp + _XMM_SAVE + 16*6]
+ vmovdqu xmm13, [rsp + _XMM_SAVE + 16*7]
+ vmovdqu xmm14, [rsp + _XMM_SAVE + 16*8]
+ vmovdqu xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*1]
+ mov rdi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov rbp, [rsp + _GPR_SAVE + 8*3]
+ mov r12, [rsp + _GPR_SAVE + 8*4]
+ mov r13, [rsp + _GPR_SAVE + 8*5]
+ mov r14, [rsp + _GPR_SAVE + 8*6]
+ mov r15, [rsp + _GPR_SAVE + 8*7]
+
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=32
+
+align 32
+clear_low_nibble:	; each mgr lens element is 0xnnnnnnnn 0000000m: nnnnnnnn is the block count, m is the lane index
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+lane_1: dq 1
+lane_2: dq 2
+lane_3: dq 3
+lane_4: dq 4
+lane_5: dq 5
+lane_6: dq 6
+lane_7: dq 7
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha512_mb_mgr_flush_avx512
+no_sha512_mb_mgr_flush_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm
new file mode 100644
index 000000000..602d95330
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_flush_sse.asm
@@ -0,0 +1,221 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_sse
+default rel
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rsi
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+
+%define unused_lanes rbx
+%define lane_data rbx
+%define tmp2 rbx
+
+%define job_rax rax
+%define tmp1 rax
+%define size_offset rax
+%define tmp rax
+%define start_offset rax
+
+%define tmp3 arg1
+
+%define extra_blocks arg2
+%define p arg2
+
+%define tmp4 r8
+%define lens0 r8
+
+%define lens1 r9
+%define lens2 r10
+%define lens3 r11
+
+; STACK_SPACE needs to be an odd multiple of 8
+_XMM_SAVE_SIZE equ 10*16
+_GPR_SAVE_SIZE equ 8*3
+_ALIGN_SIZE equ 0
+
+_XMM_SAVE equ 0
+_GPR_SAVE equ _XMM_SAVE + _XMM_SAVE_SIZE
+STACK_SPACE equ _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
+
+%define APPEND(a,b) a %+ b
+
+; SHA512_JOB* sha512_mb_mgr_flush_sse(SHA512_MB_JOB_MGR *state)
+; arg 1 : rcx : state
+global sha512_mb_mgr_flush_sse:function
+sha512_mb_mgr_flush_sse:
+
+ sub rsp, STACK_SPACE
+ mov [rsp + _GPR_SAVE + 8*0], rbx
+ mov [rsp + _GPR_SAVE + 8*1], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _GPR_SAVE + 8*2], rsi
+ movdqa [rsp + _XMM_SAVE + 16*0], xmm6
+ movdqa [rsp + _XMM_SAVE + 16*1], xmm7
+ movdqa [rsp + _XMM_SAVE + 16*2], xmm8
+ movdqa [rsp + _XMM_SAVE + 16*3], xmm9
+ movdqa [rsp + _XMM_SAVE + 16*4], xmm10
+ movdqa [rsp + _XMM_SAVE + 16*5], xmm11
+ movdqa [rsp + _XMM_SAVE + 16*6], xmm12
+ movdqa [rsp + _XMM_SAVE + 16*7], xmm13
+ movdqa [rsp + _XMM_SAVE + 16*8], xmm14
+ movdqa [rsp + _XMM_SAVE + 16*9], xmm15
+%endif
+
+
+ mov unused_lanes, [state + _unused_lanes]
+ bt unused_lanes, 16+7
+ jc return_null
+
+ ; find a lane with a non-null job
+ xor idx, idx
+ cmp qword [state + _ldata + 1 * _LANE_DATA_size + _job_in_lane], 0
+ cmovne idx, [one]
+
+ ; copy idx to empty lanes
+copy_lane_data:
+ mov tmp, [state + _args + _data_ptr + 8*idx]
+
+%assign I 0
+%rep 2
+ cmp qword [state + _ldata + I * _LANE_DATA_size + _job_in_lane], 0
+ jne APPEND(skip_,I)
+ mov [state + _args + _data_ptr + 8*I], tmp
+ mov dword [state + _lens + 4 + 8*I], 0xFFFFFFFF
+APPEND(skip_,I):
+%assign I (I+1)
+%endrep
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_sse
+ ; state and idx are intact
+
+
+len_is_0:
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ mov unused_lanes, [state + _unused_lanes]
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ movq xmm0, [state + _args_digest + 8*idx + 0*32]
+ pinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ movq xmm1, [state + _args_digest + 8*idx + 2*32]
+ pinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ movq xmm2, [state + _args_digest + 8*idx + 4*32]
+ pinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ movq xmm3, [state + _args_digest + 8*idx + 6*32]
+ pinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+ movdqa [job_rax + _result_digest + 2*16], xmm2
+ movdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + _XMM_SAVE + 16*0]
+ movdqa xmm7, [rsp + _XMM_SAVE + 16*1]
+ movdqa xmm8, [rsp + _XMM_SAVE + 16*2]
+ movdqa xmm9, [rsp + _XMM_SAVE + 16*3]
+ movdqa xmm10, [rsp + _XMM_SAVE + 16*4]
+ movdqa xmm11, [rsp + _XMM_SAVE + 16*5]
+ movdqa xmm12, [rsp + _XMM_SAVE + 16*6]
+ movdqa xmm13, [rsp + _XMM_SAVE + 16*7]
+ movdqa xmm14, [rsp + _XMM_SAVE + 16*8]
+ movdqa xmm15, [rsp + _XMM_SAVE + 16*9]
+ mov rsi, [rsp + _GPR_SAVE + 8*2]
+%endif
+ mov rbx, [rsp + _GPR_SAVE + 8*0]
+ mov r12, [rsp + _GPR_SAVE + 8*1]
+ add rsp, STACK_SPACE
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+one: dq 1
+two: dq 2
+three: dq 3
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c
new file mode 100644
index 000000000..da57e05d5
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx2.c
@@ -0,0 +1,44 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+void sha512_mb_mgr_init_avx2(SHA512_MB_JOB_MGR * state)
+{
+ unsigned int j;
+
+ state->lens[0] = 0;
+ state->lens[1] = 1;
+ state->lens[2] = 2;
+ state->lens[3] = 3;
+ state->unused_lanes = 0xFF03020100;
+ for (j = 0; j < SHA512_X4_LANES; j++) {
+ state->ldata[j].job_in_lane = 0;
+ }
+}
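+
+/*
+ * Reading note: unused_lanes is a byte stack of free lane numbers.  The
+ * initial value 0xFF03020100 lists lanes 0-3 as free (lane 0 in the low
+ * byte, popped first by the submit assembly) with 0xFF as an end-of-stack
+ * marker; the flush assembly tests the marker's position to decide whether
+ * any job is still in flight.
+ */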
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c
new file mode 100644
index 000000000..2ce996cf1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_avx512.c
@@ -0,0 +1,42 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+void sha512_mb_mgr_init_avx512(SHA512_MB_JOB_MGR * state)
+{
+ unsigned int j;
+
+ state->unused_lanes = 0x0706050403020100;
+ state->num_lanes_inuse = 0;
+ for (j = 0; j < SHA512_MAX_LANES; j++) {
+ state->lens[j] = j; // sha512_mb uses low 32bit of lens to hold idx exclusively
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c
new file mode 100644
index 000000000..d646d88fd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_init_sse.c
@@ -0,0 +1,42 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+void sha512_mb_mgr_init_sse(SHA512_MB_JOB_MGR * state)
+{
+ unsigned int j;
+
+ state->lens[0] = 0;
+ state->lens[1] = 1;
+ state->unused_lanes = 0xFF0100;
+ for (j = 0; j < SHA512_MIN_LANES; j++) {
+ state->ldata[j].job_in_lane = 0;
+ }
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm
new file mode 100644
index 000000000..d9ef88474
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx.asm
@@ -0,0 +1,258 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_avx
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
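+
+; Note: despite the names, _XMM_SAVE points at the GPR area of this frame and
+; is used for the general-purpose register saves below; the XMM registers
+; themselves are stored at the start of the frame (offsets 16*0 .. 16*9).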
+
+; SHA512_JOB* sha512_mb_mgr_submit_avx(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global sha512_mb_mgr_submit_avx:function
+sha512_mb_mgr_submit_avx:
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqa [rsp + 16*0], xmm6
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ vmovdqa [rsp + 16*4], xmm10
+ vmovdqa [rsp + 16*5], xmm11
+ vmovdqa [rsp + 16*6], xmm12
+ vmovdqa [rsp + 16*7], xmm13
+ vmovdqa [rsp + 16*8], xmm14
+ vmovdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
+
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovdqa xmm2, [job + _result_digest + 2*16]
+ vmovdqa xmm3, [job + _result_digest + 3*16]
+ vmovq [state + _args_digest + 8*lane + 0*32], xmm0
+ vpextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1
+ vmovq [state + _args_digest + 8*lane + 2*32], xmm1
+ vpextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1
+ vmovq [state + _args_digest + 8*lane + 4*32], xmm2
+ vpextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1
+ vmovq [state + _args_digest + 8*lane + 6*32], xmm3
+ vpextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
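+	; Only start hashing once both lanes hold a job (unused_lanes reduced to
+	; the bare 0xFF marker); otherwise leave the job queued and return NULL
+	; so the caller can submit more work or flush later.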
+ cmp unused_lanes, 0xff
+ jne return_null
+
+start_loop:
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_avx
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 16*0]
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ vmovdqa xmm10, [rsp + 16*4]
+ vmovdqa xmm11, [rsp + 16*5]
+ vmovdqa xmm12, [rsp + 16*6]
+ vmovdqa xmm13, [rsp + 16*7]
+ vmovdqa xmm14, [rsp + 16*8]
+ vmovdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm
new file mode 100644
index 000000000..e39b8df4d
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx2.asm
@@ -0,0 +1,266 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x4_avx2
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
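+; (despite the names, _XMM_SAVE indexes the GPR save area at stack_frame.gpr;
+;  the xmm registers themselves are stored starting at offset 0 of the frame)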
+
+; SHA512_JOB* sha512_mb_mgr_submit_avx2(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global sha512_mb_mgr_submit_avx2:function
+sha512_mb_mgr_submit_avx2:
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqa [rsp + 16*0], xmm6
+ vmovdqa [rsp + 16*1], xmm7
+ vmovdqa [rsp + 16*2], xmm8
+ vmovdqa [rsp + 16*3], xmm9
+ vmovdqa [rsp + 16*4], xmm10
+ vmovdqa [rsp + 16*5], xmm11
+ vmovdqa [rsp + 16*6], xmm12
+ vmovdqa [rsp + 16*7], xmm13
+ vmovdqa [rsp + 16*8], xmm14
+ vmovdqa [rsp + 16*9], xmm15
+%endif
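+	; xmm6-xmm15 are callee-saved in the Windows x64 ABI, hence the extra
+	; spill/restore that is only assembled for win64 output.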
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
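+	; The job length (counted in SHA512 blocks at the manager level) lands in
+	; the upper half of this lane's 64-bit lens entry (offset +4); the low
+	; byte is assumed to already hold the lane index, giving the packed
+	; "blocks : index" value the min-length search below relies on.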
+
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovdqa xmm2, [job + _result_digest + 2*16]
+ vmovdqa xmm3, [job + _result_digest + 3*16]
+ vmovq [state + _args_digest + 8*lane + 0*32], xmm0
+ vpextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1
+ vmovq [state + _args_digest + 8*lane + 2*32], xmm1
+ vpextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1
+ vmovq [state + _args_digest + 8*lane + 4*32], xmm2
+ vpextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1
+ vmovq [state + _args_digest + 8*lane + 6*32], xmm3
+ vpextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ cmp unused_lanes, 0xff
+ jne return_null
+
+start_loop:
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+ mov lens2, [state + _lens + 2*8]
+ cmp lens2, idx
+ cmovb idx, lens2
+ mov lens3, [state + _lens + 3*8]
+ cmp lens3, idx
+ cmovb idx, lens3
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ sub lens2, len2
+ sub lens3, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+ mov [state + _lens + 2*8], lens2
+ mov [state + _lens + 3*8], lens3
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x4_avx2
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*32]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*32]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*32]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*32]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqa xmm6, [rsp + 16*0]
+ vmovdqa xmm7, [rsp + 16*1]
+ vmovdqa xmm8, [rsp + 16*2]
+ vmovdqa xmm9, [rsp + 16*3]
+ vmovdqa xmm10, [rsp + 16*4]
+ vmovdqa xmm11, [rsp + 16*5]
+ vmovdqa xmm12, [rsp + 16*6]
+ vmovdqa xmm13, [rsp + 16*7]
+ vmovdqa xmm14, [rsp + 16*8]
+ vmovdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm
new file mode 100644
index 000000000..59f359f1f
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_avx512.asm
@@ -0,0 +1,279 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+extern sha512_mb_x8_avx512
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define num_lanes_inuse r9
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*8
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
+
+; SHA512_JOB* sha512_mb_mgr_submit_avx512(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global sha512_mb_mgr_submit_avx512:function
+sha512_mb_mgr_submit_avx512:
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+ mov [rsp + _XMM_SAVE + 8*5], r13
+ mov [rsp + _XMM_SAVE + 8*6], r14
+ mov [rsp + _XMM_SAVE + 8*7], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ vmovdqu [rsp + 16*0], xmm6
+ vmovdqu [rsp + 16*1], xmm7
+ vmovdqu [rsp + 16*2], xmm8
+ vmovdqu [rsp + 16*3], xmm9
+ vmovdqu [rsp + 16*4], xmm10
+ vmovdqu [rsp + 16*5], xmm11
+ vmovdqu [rsp + 16*6], xmm12
+ vmovdqu [rsp + 16*7], xmm13
+ vmovdqu [rsp + 16*8], xmm14
+ vmovdqu [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
+
+
+ ; Load digest words from result_digest
+ vmovdqa xmm0, [job + _result_digest + 0*16]
+ vmovdqa xmm1, [job + _result_digest + 1*16]
+ vmovdqa xmm2, [job + _result_digest + 2*16]
+ vmovdqa xmm3, [job + _result_digest + 3*16]
+ vmovq [state + _args_digest + 8*lane + 0*64], xmm0
+ vpextrq [state + _args_digest + 8*lane + 1*64], xmm0, 1
+ vmovq [state + _args_digest + 8*lane + 2*64], xmm1
+ vpextrq [state + _args_digest + 8*lane + 3*64], xmm1, 1
+ vmovq [state + _args_digest + 8*lane + 4*64], xmm2
+ vpextrq [state + _args_digest + 8*lane + 5*64], xmm2, 1
+ vmovq [state + _args_digest + 8*lane + 6*64], xmm3
+ vpextrq [state + _args_digest + 8*lane + 7*64], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ add num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ cmp num_lanes_inuse, 8
+ jne return_null
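+	; Unlike the 2- and 4-lane managers, which compare unused_lanes against
+	; 0xff, this 8-lane manager keeps an explicit in-use counter and only
+	; starts hashing once all 8 lanes are filled; until then the submit call
+	; returns a null job.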
+
+start_loop:
+	; Find min length; each len entry in the sha512 mgr is 64-bit: the high 32 bits hold the block count, the low 8 bits the lane index
+ vmovdqu ymm0, [state + _lens + 0*32] ; ymm0 has {D,d,C,c,B,b,A,a}
+ vmovdqu ymm1, [state + _lens + 1*32]
+
+ vpminuq ymm2, ymm0, ymm1 ; ymm2 has {D,i,C,i,B,i,A,i}
+ vpalignr ymm3, ymm3, ymm2, 8 ; ymm3 has {x,i,D,i,x,i,B,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has {x,i,F,i,x,i,E,i}
+ vperm2i128 ymm3, ymm2, ymm2, 1 ; ymm3 has {x,i,x,i,x,i,F,i}
+ vpminuq ymm2, ymm2, ymm3 ; ymm2 has min value in high dword
+
+ vmovq idx, xmm2
+ mov len2, idx
+ and idx, 0xF
+ shr len2, 32
+ jz len_is_0
+
+
+ vperm2i128 ymm2, ymm2, ymm2, 0 ; ymm2 has {x,x,E,i,x,x,E,i}
+ vpand ymm2, ymm2, [rel clear_low_nibble] ; ymm2 has {0,0,E,0,0,0,E,0}
+ vpshufd ymm2, ymm2, 0x44 ; ymm2 has {E,0,E,0,E,0,E,0}
+
+ vpsubd ymm0, ymm0, ymm2
+ vpsubd ymm1, ymm1, ymm2
+
+ vmovdqu [state + _lens + 0*32], ymm0
+ vmovdqu [state + _lens + 1*32], ymm1
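+	; The ymm sequence above is a tree reduction over the eight packed 64-bit
+	; lens entries; a roughly equivalent scalar sketch:
+	;     min = lens[0];
+	;     for (i = 1; i < 8; i++)
+	;             if (lens[i] < min) min = lens[i];
+	;     idx = min & 0xF;  blocks = min >> 32;
+	; clear_low_nibble then masks off the low halves (which carry the lane
+	; index) so only the common block count is subtracted from every lane.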
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x8_avx512
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ mov DWORD(num_lanes_inuse), [state + _num_lanes_inuse]
+ sub num_lanes_inuse, 1
+ mov [state + _num_lanes_inuse], DWORD(num_lanes_inuse)
+ vmovq xmm0, [state + _args_digest + 8*idx + 0*64]
+ vpinsrq xmm0, [state + _args_digest + 8*idx + 1*64], 1
+ vmovq xmm1, [state + _args_digest + 8*idx + 2*64]
+ vpinsrq xmm1, [state + _args_digest + 8*idx + 3*64], 1
+ vmovq xmm2, [state + _args_digest + 8*idx + 4*64]
+ vpinsrq xmm2, [state + _args_digest + 8*idx + 5*64], 1
+ vmovq xmm3, [state + _args_digest + 8*idx + 6*64]
+ vpinsrq xmm3, [state + _args_digest + 8*idx + 7*64], 1
+ vmovdqa [job_rax + _result_digest + 0*16], xmm0
+ vmovdqa [job_rax + _result_digest + 1*16], xmm1
+ vmovdqa [job_rax + _result_digest + 2*16], xmm2
+ vmovdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ vmovdqu xmm6, [rsp + 16*0]
+ vmovdqu xmm7, [rsp + 16*1]
+ vmovdqu xmm8, [rsp + 16*2]
+ vmovdqu xmm9, [rsp + 16*3]
+ vmovdqu xmm10, [rsp + 16*4]
+ vmovdqu xmm11, [rsp + 16*5]
+ vmovdqu xmm12, [rsp + 16*6]
+ vmovdqu xmm13, [rsp + 16*7]
+ vmovdqu xmm14, [rsp + 16*8]
+ vmovdqu xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov r13, [rsp + _XMM_SAVE + 8*5]
+ mov r14, [rsp + _XMM_SAVE + 8*6]
+ mov r15, [rsp + _XMM_SAVE + 8*7]
+
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=32
+
+align 32
+clear_low_nibble:	; each mgr len element is 0xnnnnnnnn0000000m: nnnnnnnn is the block count, m is the lane index
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+ dq 0xFFFFFFFF00000000, 0x0000000000000000
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha512_mb_mgr_submit_avx512
+no_sha512_mb_mgr_submit_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm
new file mode 100644
index 000000000..8b630a4da
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_mgr_submit_sse.asm
@@ -0,0 +1,256 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_job.asm"
+%include "sha512_mb_mgr_datastruct.asm"
+
+%include "reg_sizes.asm"
+
+extern sha512_mb_x2_sse
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux register definitions
+%define arg1 rdi ; rcx
+%define arg2 rsi ; rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define idx rdx ; rsi
+%define last_len rdx ; rsi
+
+%define size_offset rcx ; rdi
+%define tmp2 rcx ; rdi
+
+%else
+; WINDOWS register definitions
+%define arg1 rcx
+%define arg2 rdx
+
+; idx needs to be other than arg1, arg2, rbx, r12
+%define last_len rsi
+%define idx rsi
+
+%define size_offset rdi
+%define tmp2 rdi
+
+%endif
+
+; Common definitions
+%define state arg1
+%define job arg2
+%define len2 arg2
+%define p2 arg2
+
+%define p r11
+%define start_offset r11
+
+%define unused_lanes rbx
+
+%define job_rax rax
+%define len rax
+
+%define lane rbp
+%define tmp3 rbp
+%define lens3 rbp
+
+%define extra_blocks r8
+%define lens0 r8
+
+%define tmp r9
+%define lens1 r9
+
+%define lane_data r10
+%define lens2 r10
+
+struc stack_frame
+ .xmm: resb 16*10
+ .gpr: resb 8*5
+ .rsp: resb 8
+endstruc
+
+; STACK_SPACE needs to be an odd multiple of 8
+%define _XMM_SAVE stack_frame.gpr
+%define _GPR_SAVE stack_frame.rsp
+%define STACK_SPACE stack_frame_size
+
+; SHA512_JOB* sha512_mb_mgr_submit_sse(SHA512_MB_JOB_MGR *state, SHA512_JOB *job)
+; arg 1 : rcx : state
+; arg 2 : rdx : job
+global sha512_mb_mgr_submit_sse:function
+sha512_mb_mgr_submit_sse:
+
+ mov rax, rsp
+
+ sub rsp, STACK_SPACE
+ and rsp, ~31
+
+ mov [rsp + stack_frame.rsp], rax
+
+ mov [rsp + _XMM_SAVE + 8*0], rbx
+ mov [rsp + _XMM_SAVE + 8*1], rbp
+ mov [rsp + _XMM_SAVE + 8*2], r12
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + _XMM_SAVE + 8*3], rsi
+ mov [rsp + _XMM_SAVE + 8*4], rdi
+ movdqa [rsp + 16*0], xmm6
+ movdqa [rsp + 16*1], xmm7
+ movdqa [rsp + 16*2], xmm8
+ movdqa [rsp + 16*3], xmm9
+ movdqa [rsp + 16*4], xmm10
+ movdqa [rsp + 16*5], xmm11
+ movdqa [rsp + 16*6], xmm12
+ movdqa [rsp + 16*7], xmm13
+ movdqa [rsp + 16*8], xmm14
+ movdqa [rsp + 16*9], xmm15
+%endif
+
+ mov unused_lanes, [state + _unused_lanes]
+ movzx lane, BYTE(unused_lanes)
+ shr unused_lanes, 8
+ imul lane_data, lane, _LANE_DATA_size
+ mov dword [job + _status], STS_BEING_PROCESSED
+ lea lane_data, [state + _ldata + lane_data]
+ mov [state + _unused_lanes], unused_lanes
+ mov DWORD(len), [job + _len]
+
+ mov [lane_data + _job_in_lane], job
+ mov [state + _lens + 4 + 8*lane], DWORD(len)
+
+ ; Load digest words from result_digest
+ movdqa xmm0, [job + _result_digest + 0*16]
+ movdqa xmm1, [job + _result_digest + 1*16]
+ movdqa xmm2, [job + _result_digest + 2*16]
+ movdqa xmm3, [job + _result_digest + 3*16]
+ movq [state + _args_digest + 8*lane + 0*32], xmm0
+ pextrq [state + _args_digest + 8*lane + 1*32], xmm0, 1
+ movq [state + _args_digest + 8*lane + 2*32], xmm1
+ pextrq [state + _args_digest + 8*lane + 3*32], xmm1, 1
+ movq [state + _args_digest + 8*lane + 4*32], xmm2
+ pextrq [state + _args_digest + 8*lane + 5*32], xmm2, 1
+ movq [state + _args_digest + 8*lane + 6*32], xmm3
+ pextrq [state + _args_digest + 8*lane + 7*32], xmm3, 1
+
+ mov p, [job + _buffer]
+ mov [state + _args_data_ptr + 8*lane], p
+
+ cmp unused_lanes, 0xff
+ jne return_null
+
+start_loop:
+
+ ; Find min length
+ mov lens0, [state + _lens + 0*8]
+ mov idx, lens0
+ mov lens1, [state + _lens + 1*8]
+ cmp lens1, idx
+ cmovb idx, lens1
+
+ mov len2, idx
+ and idx, 0xF
+ and len2, ~0xFF
+ jz len_is_0
+
+ sub lens0, len2
+ sub lens1, len2
+ shr len2, 32
+ mov [state + _lens + 0*8], lens0
+ mov [state + _lens + 1*8], lens1
+
+ ; "state" and "args" are the same address, arg1
+ ; len is arg2
+ call sha512_mb_x2_sse
+ ; state and idx are intact
+
+len_is_0:
+
+ ; process completed job "idx"
+ imul lane_data, idx, _LANE_DATA_size
+ lea lane_data, [state + _ldata + lane_data]
+
+ mov job_rax, [lane_data + _job_in_lane]
+
+ mov unused_lanes, [state + _unused_lanes]
+ mov qword [lane_data + _job_in_lane], 0
+ mov dword [job_rax + _status], STS_COMPLETED
+ shl unused_lanes, 8
+ or unused_lanes, idx
+ mov [state + _unused_lanes], unused_lanes
+
+ movq xmm0, [state + _args_digest + 8*idx + 0*32]
+ pinsrq xmm0, [state + _args_digest + 8*idx + 1*32], 1
+ movq xmm1, [state + _args_digest + 8*idx + 2*32]
+ pinsrq xmm1, [state + _args_digest + 8*idx + 3*32], 1
+ movq xmm2, [state + _args_digest + 8*idx + 4*32]
+ pinsrq xmm2, [state + _args_digest + 8*idx + 5*32], 1
+ movq xmm3, [state + _args_digest + 8*idx + 6*32]
+ pinsrq xmm3, [state + _args_digest + 8*idx + 7*32], 1
+
+ movdqa [job_rax + _result_digest + 0*16], xmm0
+ movdqa [job_rax + _result_digest + 1*16], xmm1
+ movdqa [job_rax + _result_digest + 2*16], xmm2
+ movdqa [job_rax + _result_digest + 3*16], xmm3
+
+return:
+
+%ifidn __OUTPUT_FORMAT__, win64
+ movdqa xmm6, [rsp + 16*0]
+ movdqa xmm7, [rsp + 16*1]
+ movdqa xmm8, [rsp + 16*2]
+ movdqa xmm9, [rsp + 16*3]
+ movdqa xmm10, [rsp + 16*4]
+ movdqa xmm11, [rsp + 16*5]
+ movdqa xmm12, [rsp + 16*6]
+ movdqa xmm13, [rsp + 16*7]
+ movdqa xmm14, [rsp + 16*8]
+ movdqa xmm15, [rsp + 16*9]
+ mov rsi, [rsp + _XMM_SAVE + 8*3]
+ mov rdi, [rsp + _XMM_SAVE + 8*4]
+%endif
+ mov rbx, [rsp + _XMM_SAVE + 8*0]
+ mov rbp, [rsp + _XMM_SAVE + 8*1]
+ mov r12, [rsp + _XMM_SAVE + 8*2]
+ mov rsp, [rsp + stack_frame.rsp]
+
+ ret
+
+return_null:
+ xor job_rax, job_rax
+ jmp return
+
+section .data align=16
+
+align 16
+H0: dd 0x6a09e667
+H1: dd 0xbb67ae85
+H2: dd 0x3c6ef372
+H3: dd 0xa54ff53a
+H4: dd 0x510e527f
+H5: dd 0x9b05688c
+H6: dd 0x1f83d9ab
+H7: dd 0x5be0cd19
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c
new file mode 100644
index 000000000..edb57bc33
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_ssl_test.c
@@ -0,0 +1,171 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha512_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 200
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][8 * SHA512_DIGEST_NWORDS];
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+uint64_t byteswap64(uint64_t x)
+{
+#if defined (__ICC)
+ return _bswap64(x);
+#elif defined (__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+ return __builtin_bswap64(x);
+#else
+ return (((x & (0xffull << 0)) << 56)
+ | ((x & (0xffull << 8)) << 40)
+ | ((x & (0xffull << 16)) << 24)
+ | ((x & (0xffull << 24)) << 8)
+ | ((x & (0xffull << 32)) >> 8)
+ | ((x & (0xffull << 40)) >> 24)
+ | ((x & (0xffull << 48)) >> 40)
+ | ((x & (0xffull << 56)) >> 56));
+#endif
+}
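+// OpenSSL's SHA512() writes the digest out as big-endian bytes, while the
+// multi-buffer job keeps native-endian 64-bit words; byteswap64 above lets
+// the test compare the two directly on little-endian x86.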
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+
+ printf("multibinary_sha512 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // SSL test
+ SHA512(bufs[i], TEST_LEN, digest_ssl[i]);
+
+ // sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
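+	// submit() parks each job in a free lane and may hand back some other,
+	// already finished context; flush() is then called until it returns NULL
+	// to push the remaining, partially filled lanes through the hash core.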
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ byteswap64(((uint64_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %016lX <=> %016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ byteswap64(((uint64_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+			// Random buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run SSL test
+ SHA512(bufs[i], lens[i], digest_ssl[i]);
+
+ // Run sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ byteswap64(((uint64_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %016lX <=> %016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ byteswap64(((uint64_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha512_ssl rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c
new file mode 100644
index 000000000..a1b805737
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_test.c
@@ -0,0 +1,197 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha512_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+static uint64_t digest_ref[TEST_BUFS][SHA512_DIGEST_NWORDS];
+
+// Compare against reference function
+extern void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS];
+ uint32_t i, j, fail = 0;
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int jobs, t;
+ uint8_t *tmp_buf;
+
+ printf("multibinary_sha512 test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ sha512_ctx_mgr_init(mgr);
+
+ srand(TEST_SEED);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ // Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+		// Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha512_ref(bufs[i], digest_ref[i], TEST_LEN);
+
+ // Run sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d "
+ "fail 0x%016lX <=> 0x%016lX \n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < jobs; i++) {
+ // Use buffer with random len and contents
+ lens[i] = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], lens[i]);
+
+ // Run reference test
+ sha512_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // Run sha512_mb test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail "
+ "0x%016lX <=> 0x%016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ // Test at the end of buffer
+ jobs = rand() % TEST_BUFS;
+ tmp_buf = (uint8_t *) malloc(sizeof(uint8_t) * jobs);
+ if (!tmp_buf) {
+ printf("malloc failed, end test aborted.\n");
+ return 1;
+ }
+
+ rand_buffer(tmp_buf, jobs);
+
+ sha512_ctx_mgr_init(mgr);
+
+ // Extend to the end of allocated buffer to construct jobs
+ for (i = 0; i < jobs; i++) {
+ bufs[i] = (uint8_t *) & tmp_buf[i];
+ lens[i] = jobs - i;
+
+ // Reference test
+ sha512_ref(bufs[i], digest_ref[i], lens[i]);
+
+ // sb_sha512 test
+ sha512_ctx_mgr_submit(mgr, &ctxpool[i], bufs[i], lens[i], HASH_ENTIRE);
+ }
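+	// Every job constructed here ends exactly at the end of tmp_buf, so
+	// reading even one byte past the stated length would run off the
+	// allocation.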
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("End test failed at offset %d - result: 0x%016lX"
+ ", ref: 0x%016lX\n", i, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+
+ putchar('.');
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha512 rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c
new file mode 100644
index 000000000..a05168b70
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_rand_update_test.c
@@ -0,0 +1,294 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "sha512_mb.h"
+
+#define TEST_LEN (1024*1024)
+#define TEST_BUFS 100
+#ifndef RANDOMS
+# define RANDOMS 10
+#endif
+#ifndef TEST_SEED
+# define TEST_SEED 0x1234
+#endif
+
+#define UPDATE_SIZE 13*SHA512_BLOCK_SIZE
+#define MAX_RAND_UPDATE_BLOCKS (TEST_LEN/(16*SHA512_BLOCK_SIZE))
+
+#ifdef DEBUG
+# define debug_char(x) putchar(x)
+#else
+# define debug_char(x) do {} while (0)
+#endif
+
+/* Reference digest global to reduce stack usage */
+static uint64_t digest_ref[TEST_BUFS][SHA512_DIGEST_NWORDS];
+
+extern void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len);
+
+// Generates pseudo-random data
+
+void rand_buffer(unsigned char *buf, const long buffer_size)
+{
+ long i;
+ for (i = 0; i < buffer_size; i++)
+ buf[i] = rand();
+}
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS], *ctx = NULL;
+ uint32_t i, j, fail = 0;
+ int len_done, len_rem, len_rand;
+ unsigned char *bufs[TEST_BUFS];
+ unsigned char *buf_ptr[TEST_BUFS];
+ uint32_t lens[TEST_BUFS];
+ unsigned int joblen, jobs, t;
+
+ printf("multibinary_sha512_update test, %d sets of %dx%d max: ", RANDOMS, TEST_BUFS,
+ TEST_LEN);
+
+ srand(TEST_SEED);
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ sha512_ctx_mgr_init(mgr);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+		// Allocate and fill buffer
+ bufs[i] = (unsigned char *)malloc(TEST_LEN);
+ buf_ptr[i] = bufs[i];
+ if (bufs[i] == NULL) {
+ printf("malloc failed test aborted\n");
+ return 1;
+ }
+ rand_buffer(bufs[i], TEST_LEN);
+
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+
+ // Run reference test
+ sha512_ref(bufs[i], digest_ref[i], TEST_LEN);
+ }
+
+ // Run sb_sha512 tests
+ for (i = 0; i < TEST_BUFS;) {
+ len_done = (int)((unsigned long)buf_ptr[i] - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_done == 0)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_FIRST);
+ else if (len_rem <= UPDATE_SIZE)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ // Add jobs while available or finished
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+ }
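+	// Each pass above feeds UPDATE_SIZE bytes (a whole number of SHA512
+	// blocks) per job: HASH_FIRST for the first chunk, HASH_UPDATE for the
+	// middle chunks, and HASH_LAST once the remainder fits in a final chunk.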
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha512_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha512_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] += UPDATE_SIZE;
+
+ len_done = (int)((unsigned long)buf_ptr[i]
+ - (unsigned long)bufs[i]);
+ len_rem = TEST_LEN - len_done;
+
+ if (len_rem <= UPDATE_SIZE)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], UPDATE_SIZE, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha512_ctx_mgr_flush(mgr);
+ }
+
+ // Check digests
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d fixed size, digest%d fail %8lX <=> %8lX",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ putchar('.');
+
+ // Run tests with random size and number of jobs
+ for (t = 0; t < RANDOMS; t++) {
+ jobs = rand() % (TEST_BUFS);
+
+ for (i = 0; i < jobs; i++) {
+ joblen = rand() % (TEST_LEN);
+ rand_buffer(bufs[i], joblen);
+ lens[i] = joblen;
+ buf_ptr[i] = bufs[i];
+ sha512_ref(bufs[i], digest_ref[i], lens[i]);
+ }
+
+ sha512_ctx_mgr_init(mgr);
+
+ // Run sha512_sb jobs
+ i = 0;
+ while (i < jobs) {
+ // Submit a new job
+ len_rand = SHA512_BLOCK_SIZE +
+ SHA512_BLOCK_SIZE * (rand() % MAX_RAND_UPDATE_BLOCKS);
+
+ if (lens[i] > len_rand)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_FIRST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], lens[i], HASH_ENTIRE);
+
+ // Returned ctx could be:
+ // - null context (we are just getting started and lanes aren't full yet), or
+ // - finished already (an ENTIRE we submitted or a previous LAST is returned), or
+ // - an unfinished ctx, we will resubmit
+
+ if ((ctx == NULL) || hash_ctx_complete(ctx)) {
+ i++;
+ continue;
+ } else {
+ // unfinished ctx returned, choose another random update length and submit either
+ // UPDATE or LAST depending on the amount of buffer remaining
+ while ((ctx != NULL) && !(hash_ctx_complete(ctx))) {
+ j = (unsigned long)(ctx->user_data); // Get index of the returned ctx
+ buf_ptr[j] = bufs[j] + ctx->total_length;
+ len_rand = (rand() % SHA512_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ len_rem = lens[j] - ctx->total_length;
+
+ if (len_rem <= len_rand) // submit the rest of the job as LAST
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rem,
+ HASH_LAST);
+ else // submit the random update length as UPDATE
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[j],
+ buf_ptr[j],
+ len_rand,
+ HASH_UPDATE);
+ } // Either continue submitting any contexts returned here as UPDATE/LAST, or
+ // go back to submitting new jobs using the index i.
+
+ i++;
+ }
+ }
+
+ // Start flushing finished jobs, end on last flushed
+ ctx = sha512_ctx_mgr_flush(mgr);
+ while (ctx) {
+ if (hash_ctx_complete(ctx)) {
+ debug_char('-');
+ ctx = sha512_ctx_mgr_flush(mgr);
+ continue;
+ }
+ // Resubmit unfinished job
+ i = (unsigned long)(ctx->user_data);
+ buf_ptr[i] = bufs[i] + ctx->total_length; // update buffer pointer
+ len_rem = lens[i] - ctx->total_length;
+ len_rand = (rand() % SHA512_BLOCK_SIZE)
+ * (rand() % MAX_RAND_UPDATE_BLOCKS);
+ debug_char('+');
+ if (len_rem <= len_rand)
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rem, HASH_LAST);
+ else
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ buf_ptr[i], len_rand, HASH_UPDATE);
+
+ if (ctx == NULL)
+ ctx = sha512_ctx_mgr_flush(mgr);
+ }
+
+ // Check result digest
+ for (i = 0; i < jobs; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] != digest_ref[i][j]) {
+ fail++;
+ printf("Test%d, digest%d fail %8lX <=> %8lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ digest_ref[i][j]);
+ }
+ }
+ }
+ if (fail) {
+ printf("Test failed function check %d\n", fail);
+ return fail;
+ }
+
+ putchar('.');
+ fflush(0);
+ } // random test t
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf(" multibinary_sha512_update rand: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c
new file mode 100644
index 000000000..747de43bb
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_test.c
@@ -0,0 +1,264 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "sha512_mb.h"
+
+typedef uint64_t DigestSHA512[SHA512_DIGEST_NWORDS];
+
+#define MSGS 8
+#define NUM_JOBS 1000
+
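+// Cheap deterministic scramble: maps a job index onto one of the MSGS test
+// messages so the larger run below submits the known vectors out of order.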
+#define PSEUDO_RANDOM_NUM(seed)	(((seed) * 5 + ((seed) * (seed)) / 64) % MSGS)
+
+static uint8_t msg1[] = "The quick brown fox jumps over the lazy dog";
+static uint8_t msg2[] = "The quick brown fox jumps over the lazy dog.";
+static uint8_t msg3[] = { 0x0a, 0x55, 0xdb, 0 };
+static uint8_t msg4[] = { 0xba, 0xd7, 0xc6, 0x18, 0xf4, 0x5b, 0xe2, 0x07, 0x97, 0x5e, 0 };
+
+static uint8_t msg5[] = {
+ 0xb1, 0x71, 0x5f, 0x78, 0x2f, 0xf0, 0x2c, 0x6b, 0x88, 0x93,
+ 0x7f, 0x05, 0x41, 0x16, 0
+};
+
+static uint8_t msg6[] = {
+ 0xc6, 0xa1, 0x70, 0x93, 0x65, 0x68, 0x65, 0x10, 0x20, 0xed,
+ 0xfe, 0x15, 0xdf, 0x80, 0x12, 0xac, 0xda, 0x8d, 0
+};
+
+static uint8_t msg7[] = {
+ 0xa8, 0xa3, 0x7d, 0xfc, 0x08, 0x3a, 0xd2, 0xf4, 0x7f, 0xff,
+ 0x46, 0x87, 0x38, 0xbf, 0x8b, 0x72, 0x8e, 0xb7, 0xf1, 0x90,
+ 0x7e, 0x42, 0x7f, 0xa1, 0x5c, 0xb4, 0x42, 0x4b, 0xc6, 0x85,
+ 0xe5, 0x5e, 0xd7, 0xb2, 0x82, 0x5c, 0x9c, 0x60, 0xb8, 0x39,
+ 0xcc, 0xc2, 0xfe, 0x5f, 0xb3, 0x3e, 0x36, 0xf5, 0x70, 0xcb,
+ 0x86, 0x61, 0x60, 0x9e, 0x63, 0x0b, 0xda, 0x05, 0xee, 0x64,
+ 0x1d, 0x93, 0x84, 0x28, 0x86, 0x7d, 0x90, 0xe0, 0x07, 0x44,
+ 0xa4, 0xaa, 0xd4, 0x94, 0xc9, 0x3c, 0x5f, 0x6d, 0x13, 0x27,
+ 0x87, 0x80, 0x78, 0x59, 0x0c, 0xdc, 0xe1, 0xe6, 0x47, 0xc9,
+ 0x82, 0x08, 0x18, 0xf4, 0x67, 0x64, 0x1f, 0xcd, 0x50, 0x8e,
+ 0x2f, 0x2e, 0xbf, 0xd0, 0xff, 0x3d, 0x4f, 0x27, 0x23, 0x93,
+ 0x47, 0x8f, 0x3b, 0x9e, 0x6f, 0x80, 0x6b, 0x43, 0
+};
+
+static uint8_t msg8[] = "";
+
+static DigestSHA512 expResultDigest1 = {
+ 0x07e547d9586f6a73, 0xf73fbac0435ed769, 0x51218fb7d0c8d788, 0xa309d785436bbb64,
+ 0x2e93a252a954f239, 0x12547d1e8a3b5ed6, 0xe1bfd7097821233f, 0xa0538f3db854fee6
+};
+
+static DigestSHA512 expResultDigest2 = {
+ 0x91ea1245f20d46ae, 0x9a037a989f54f1f7, 0x90f0a47607eeb8a1, 0x4d12890cea77a1bb,
+ 0xc6c7ed9cf205e67b, 0x7f2b8fd4c7dfd3a7, 0xa8617e45f3c463d4, 0x81c7e586c39ac1ed
+};
+
+static DigestSHA512 expResultDigest3 = {
+ 0x7952585e5330cb24, 0x7d72bae696fc8a6b, 0x0f7d0804577e347d, 0x99bc1b11e52f3849,
+ 0x85a428449382306a, 0x89261ae143c2f3fb, 0x613804ab20b42dc0, 0x97e5bf4a96ef919b
+};
+
+static DigestSHA512 expResultDigest4 = {
+ 0x5886828959d1f822, 0x54068be0bd14b6a8, 0x8f59f534061fb203, 0x76a0541052dd3635,
+ 0xedf3c6f0ca3d0877, 0x5e13525df9333a21, 0x13c0b2af76515887, 0x529910b6c793c8a5
+};
+
+static DigestSHA512 expResultDigest5 = {
+ 0xee1a56ee78182ec4, 0x1d2c3ab33d4c4187, 0x1d437c5c1ca060ee, 0x9e219cb83689b4e5,
+ 0xa4174dfdab5d1d10, 0x96a31a7c8d3abda7, 0x5c1b5e6da97e1814, 0x901c505b0bc07f25
+};
+
+static DigestSHA512 expResultDigest6 = {
+ 0xc36c100cdb6c8c45, 0xb072f18256d63a66, 0xc9843acb4d07de62, 0xe0600711d4fbe64c,
+ 0x8cf314ec3457c903, 0x08147cb7ac7e4d07, 0x3ba10f0ced78ea72, 0x4a474b32dae71231
+};
+
+static DigestSHA512 expResultDigest7 = {
+ 0x8e1c91729be8eb40, 0x226f6c58a029380e, 0xf7edb9dc166a5c3c, 0xdbcefe90bd30d85c,
+ 0xb7c4b248e66abf0a, 0x3a4c842281299bef, 0x6db88858d9e5ab52, 0x44f70b7969e1c072
+};
+
+static DigestSHA512 expResultDigest8 = {
+	0xcf83e1357eefb8bd, 0xf1542850d66d8007, 0xd620e4050b5715dc, 0x83f4a921d36ce9ce,
+	0x47d0d13c5d85f2b0, 0xff8318d2877eec2f, 0x63b931bd47417a81, 0xa538327af927da3e
+};
+
+static uint8_t *msgs[MSGS] = { msg1, msg2, msg3, msg4, msg5, msg6, msg7, msg8 };
+
+static uint64_t *expResultDigest[MSGS] = { expResultDigest1, expResultDigest2,
+ expResultDigest3, expResultDigest4, expResultDigest5, expResultDigest6,
+ expResultDigest7, expResultDigest8
+};
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[NUM_JOBS], *ctx = NULL;
+ uint32_t i, j, k, t, checked = 0;
+ uint64_t *good;
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ sha512_ctx_mgr_init(mgr);
+
+ // Init contexts before first use
+ for (i = 0; i < MSGS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ for (i = 0; i < MSGS; i++) {
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[i], strlen((char *)msgs[i]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+ }
+ }
+
+ while (1) {
+ ctx = sha512_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ good = expResultDigest[t];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the "
+ "submit. Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ // do larger test in pseudo-random order
+
+ // Init contexts before first use
+ for (i = 0; i < NUM_JOBS; i++) {
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ checked = 0;
+ for (i = 0; i < NUM_JOBS; i++) {
+ j = PSEUDO_RANDOM_NUM(i);
+
+ ctx = sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i],
+ msgs[j], strlen((char *)msgs[j]), HASH_ENTIRE);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ }
+ }
+ while (1) {
+ ctx = sha512_ctx_mgr_flush(mgr);
+
+ if (ctx) {
+ t = (unsigned long)(ctx->user_data);
+ k = PSEUDO_RANDOM_NUM(t);
+ good = expResultDigest[k];
+ checked++;
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (good[j] != ctxpool[t].job.result_digest[j]) {
+ printf("Test %d, digest %d is %016lX, "
+ "should be %016lX\n", t, j,
+ ctxpool[t].job.result_digest[j], good[j]);
+ return -1;
+ }
+ }
+
+ if (ctx->error) {
+ printf("Something bad happened during the"
+ " submit. Error code: %d", ctx->error);
+ return -1;
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (checked != NUM_JOBS) {
+ printf("only tested %d rather than %d\n", checked, NUM_JOBS);
+ return -1;
+ }
+
+ printf(" multibinary_sha512 test: Pass\n");
+
+ return 0;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c
new file mode 100644
index 000000000..8af563068
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_vs_ossl_perf.c
@@ -0,0 +1,143 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <openssl/sha.h>
+#include "sha512_mb.h"
+#include "test.h"
+
+// Set number of outstanding jobs
+#define TEST_BUFS 32
+
+#ifdef CACHED_TEST
+// Loop many times over same data
+# define TEST_LEN 4*1024
+# define TEST_LOOPS 1000
+# define TEST_TYPE_STR "_warm"
+#else
+// Uncached test. Pull from large mem base.
+# define GT_L3_CACHE 32*1024*1024 /* some number > last level cache */
+# define TEST_LEN (GT_L3_CACHE / TEST_BUFS)
+# define TEST_LOOPS 10
+# define TEST_TYPE_STR "_cold"
+#endif
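+// "_warm" loops many times over small buffers that stay resident in cache;
+// "_cold" sizes the per-buffer length so the working set exceeds a typical
+// last-level cache and every pass streams from memory.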
+
+#define TEST_MEM TEST_LEN * TEST_BUFS * TEST_LOOPS
+
+/* Reference digest global to reduce stack usage */
+static uint8_t digest_ssl[TEST_BUFS][8 * SHA512_DIGEST_NWORDS];
+
+static inline uint64_t byteswap64(uint64_t x)
+{
+#if defined (__ICC)
+ return _bswap64(x);
+#elif defined (__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+ return __builtin_bswap64(x);
+#else
+ return (((x & (0xffull << 0)) << 56)
+ | ((x & (0xffull << 8)) << 40)
+ | ((x & (0xffull << 16)) << 24)
+ | ((x & (0xffull << 24)) << 8)
+ | ((x & (0xffull << 32)) >> 8)
+ | ((x & (0xffull << 40)) >> 24)
+ | ((x & (0xffull << 48)) >> 40)
+ | ((x & (0xffull << 56)) >> 56));
+#endif
+}
+
+int main(void)
+{
+ SHA512_HASH_CTX_MGR *mgr = NULL;
+ SHA512_HASH_CTX ctxpool[TEST_BUFS];
+ unsigned char *bufs[TEST_BUFS];
+ uint32_t i, j, t, fail = 0;
+ struct perf start, stop;
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ bufs[i] = (unsigned char *)calloc((size_t) TEST_LEN, 1);
+ if (bufs[i] == NULL) {
+ printf("calloc failed test aborted\n");
+ return 1;
+ }
+ // Init ctx contents
+ hash_ctx_init(&ctxpool[i]);
+ ctxpool[i].user_data = (void *)((uint64_t) i);
+ }
+
+ posix_memalign((void *)&mgr, 16, sizeof(SHA512_HASH_CTX_MGR));
+ sha512_ctx_mgr_init(mgr);
+
+ // Start OpenSSL tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ SHA512(bufs[i], TEST_LEN, digest_ssl[i]);
+ }
+ perf_stop(&stop);
+
+ printf("sha512_openssl" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ // Start mb tests
+ perf_start(&start);
+ for (t = 0; t < TEST_LOOPS; t++) {
+ for (i = 0; i < TEST_BUFS; i++)
+ sha512_ctx_mgr_submit(mgr,
+ &ctxpool[i], bufs[i], TEST_LEN, HASH_ENTIRE);
+
+ while (sha512_ctx_mgr_flush(mgr)) ;
+ }
+ perf_stop(&stop);
+
+ printf("multibinary_sha512" TEST_TYPE_STR ": ");
+ perf_print(stop, start, (long long)TEST_LEN * i * t);
+
+ for (i = 0; i < TEST_BUFS; i++) {
+ for (j = 0; j < SHA512_DIGEST_NWORDS; j++) {
+ if (ctxpool[i].job.result_digest[j] !=
+ byteswap64(((uint64_t *) digest_ssl[i])[j])) {
+ fail++;
+ printf("Test%d, digest%d fail %016lX <=> %016lX\n",
+ i, j, ctxpool[i].job.result_digest[j],
+ byteswap64(((uint64_t *) digest_ssl[i])[j]));
+ }
+ }
+ }
+
+ printf("Multi-buffer sha512 test complete %d buffers of %d B with "
+ "%d iterations\n", TEST_BUFS, TEST_LEN, TEST_LOOPS);
+
+ if (fail)
+ printf("Test failed function check %d\n", fail);
+ else
+ printf("multibinary_sha512_ossl_perf: Pass\n");
+
+ return fail;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm
new file mode 100644
index 000000000..d1167dd49
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_avx.asm
@@ -0,0 +1,438 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+default rel
+
+;; code to compute SHA512 by-2 using AVX
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11
+;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
+;;
+;; Linux clobbers: rax rsi r8 r9 r10 r11
+;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*4
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux definitions
+%define arg1 rdi
+%define arg2 rsi
+%else
+; Windows definitions
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND r8
+%define TBL r11
+
+%define inp0 r9
+%define inp1 r10
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ2
+
+; Define stack usage
+
+struc STACK
+_DATA: resb SZ2 * 16
+_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS
+ resb 8 ; for alignment, must be odd multiple of 8
+endstruc
+
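+; message pointers are not guaranteed to be 16-byte aligned, so lane data is loaded unaligned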
+%define VMOVPD vmovupd
+
+; transpose r0, r1, t0
+; Input looks like {r0 r1}
+; r0 = {a1 a0}
+; r1 = {b1 b0}
+;
+; output looks like
+; r0 = {b0, a0}
+; t0 = {b1, a1}
+
+%macro TRANSPOSE 3
+%define %%r0 %1
+%define %%r1 %2
+%define %%t0 %3
+ vshufpd %%t0, %%r0, %%r1, 11b ; t0 = b1 a1
+ vshufpd %%r0, %%r0, %%r1, 00b ; r0 = b0 a0
+%endm
+
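+; Rotate the symbolic names a..h by one position after each round; the register
+; that held h (now carrying the freshly computed working value) becomes a.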
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; packed-rotate-right-double
+; does a rotate by doing two shifts and an or
+%macro PRORQ 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsllq %%tmp, %%reg, (64-(%%imm))
+ vpsrlq %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORQ_nd reg, imm, tmp, src
+%macro PRORQ_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsllq %%tmp, %%src, (64-(%%imm))
+ vpsrlq %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORQ dst/src, amt
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
+
+; PRORQ_nd dst, src, amt
+%macro PRORQ_nd 3
+ PRORQ_nd %1, %3, TMP, %2
+%endmacro
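+; e.g. PRORQ a0, 14 rotates each 64-bit lane of a0 right by 14 (TMP as scratch),
+; and PRORQ_nd a1, e, 41 writes the 41-bit rotation of e into a1, leaving e intact.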
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41)
+ vmovdqa [SZ2*(%%i&0xf) + rsp + _DATA],%%T1
+ vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K
+	vpxor	a0, a0, e	; sig1: a0 = e ^ (e >> 4)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ vpaddq h, h, a2 ; h = h + ch
+ PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6)
+ vpaddq h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ vmovdqa %%T1, a ; maj: T1 = a
+ PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39)
+ vpxor %%T1, %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ2 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddq h, h, a0
+
+ vpaddq d, d, h
+
+	vpxor	a2, a2, a	; sig0: a2 = a ^ (a >> 6)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddq h, h, a1 ; h = h + ch + W + K + maj
+ vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0
+ ROTATE_ARGS
+%endm
+
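+; Rounds 16..79 extend the message schedule held in the 16-entry ring at rsp + _DATA:
+; W[i] = W[i-16] + sigma0(W[i-15]) + W[i-7] + sigma1(W[i-2]), then ROUND_00_15 is run.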
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp + _DATA]
+ vmovdqa a1, [SZ2*((%%i-2)&0xf) + rsp + _DATA]
+ vmovdqa a0, %%T1
+ PRORQ %%T1, 8-1
+ vmovdqa a2, a1
+ PRORQ a1, 61-19
+ vpxor %%T1, %%T1, a0
+ PRORQ %%T1, 1
+ vpxor a1, a1, a2
+ PRORQ a1, 19
+ vpsrlq a0, a0, 7
+ vpxor %%T1, %%T1, a0
+ vpsrlq a2, a2, 6
+ vpxor a1, a1, a2
+ vpaddq %%T1, %%T1, [SZ2*((%%i-16)&0xf) + rsp + _DATA]
+ vpaddq a1, a1, [SZ2*((%%i-7)&0xf) + rsp + _DATA]
+ vpaddq %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+;; void sha512_mb_x2_avx(SHA512_MB_ARGS_X4 *args, uint64_t msg_size_in_blocks)
+;; arg 1 : STATE : pointer args (only 2 of the 4 lanes used)
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;;
+global sha512_mb_x2_avx:function internal
+align 32
+sha512_mb_x2_avx:
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+
+ sub rsp, STACK_size
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE]
+ vmovdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE]
+
+ lea TBL,[K512_2_MB]
+
+ ;; load the address of each of the 2 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ2], a
+ vmovdqa [rsp + _DIGEST + 1*SZ2], b
+ vmovdqa [rsp + _DIGEST + 2*SZ2], c
+ vmovdqa [rsp + _DIGEST + 3*SZ2], d
+ vmovdqa [rsp + _DIGEST + 4*SZ2], e
+ vmovdqa [rsp + _DIGEST + 5*SZ2], f
+ vmovdqa [rsp + _DIGEST + 6*SZ2], g
+ vmovdqa [rsp + _DIGEST + 7*SZ2], h
+
+%assign i 0
+%rep 8
+ ;; load up the shuffler for little-endian to big-endian format
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits
+ VMOVPD TT2,[inp1+IDX+i*16]
+
+ TRANSPOSE TT0, TT2, TT1
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+
+ ROUND_00_15 TT0,(i*2+0)
+ ROUND_00_15 TT1,(i*2+1)
+%assign i (i+1)
+%endrep
+
+;; Increment IDX by message block size == 8 (loop) * 16 (XMM width in bytes)
+ add IDX, 8 * 16
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddq a, a, [rsp + _DIGEST + 0*SZ2]
+ vpaddq b, b, [rsp + _DIGEST + 1*SZ2]
+ vpaddq c, c, [rsp + _DIGEST + 2*SZ2]
+ vpaddq d, d, [rsp + _DIGEST + 3*SZ2]
+ vpaddq e, e, [rsp + _DIGEST + 4*SZ2]
+ vpaddq f, f, [rsp + _DIGEST + 5*SZ2]
+ vpaddq g, g, [rsp + _DIGEST + 6*SZ2]
+ vpaddq h, h, [rsp + _DIGEST + 7*SZ2]
+
+ sub INP_SIZE, 1 ;; consumed one message block
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqa [STATE+0*SHA512_DIGEST_ROW_SIZE],a
+ vmovdqa [STATE+1*SHA512_DIGEST_ROW_SIZE],b
+ vmovdqa [STATE+2*SHA512_DIGEST_ROW_SIZE],c
+ vmovdqa [STATE+3*SHA512_DIGEST_ROW_SIZE],d
+ vmovdqa [STATE+4*SHA512_DIGEST_ROW_SIZE],e
+ vmovdqa [STATE+5*SHA512_DIGEST_ROW_SIZE],f
+ vmovdqa [STATE+6*SHA512_DIGEST_ROW_SIZE],g
+ vmovdqa [STATE+7*SHA512_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, STACK_size
+
+ ; outer calling routine restores XMM and other GP registers
+ ret
+
+section .data
+K512_2_MB:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817
+
+
+align 32
+; one from sha512_rorx
+; this does the big endian to little endian conversion
+; over a quad word
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm
new file mode 100644
index 000000000..f492021ae
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x2_sse.asm
@@ -0,0 +1,420 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+default rel
+
+;; code to compute SHA512 by-2 using SSE
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; xmm0-15
+;; Stack must be aligned to 16 bytes before call
+;; Windows clobbers: rax rdx r8 r9 r10 r11
+;; Windows preserves: rbx rcx rsi rdi rbp r12 r13 r14 r15
+;;
+;; Linux clobbers: rax rsi r8 r9 r10 r11
+;; Linux preserves: rbx rcx rdx rdi rbp r12 r13 r14 r15
+;;
+;; clobbers xmm0-15
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*4
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; Linux definitions
+ %define arg1 rdi
+ %define arg2 rsi
+%else
+; Windows definitions
+ %define arg1 rcx
+ %define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND r8
+%define TBL r11
+
+%define inp0 r9
+%define inp1 r10
+
+%define a xmm0
+%define b xmm1
+%define c xmm2
+%define d xmm3
+%define e xmm4
+%define f xmm5
+%define g xmm6
+%define h xmm7
+
+%define a0 xmm8
+%define a1 xmm9
+%define a2 xmm10
+
+%define TT0 xmm14
+%define TT1 xmm13
+%define TT2 xmm12
+%define TT3 xmm11
+%define TT4 xmm10
+%define TT5 xmm9
+
+%define T1 xmm14
+%define TMP xmm15
+
+%define SZ2 2*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ2
+
+; Define stack usage
+
+struc STACK
+_DATA: resb SZ2 * 16
+_DIGEST: resb SZ2 * NUM_SHA512_DIGEST_WORDS
+ resb 8 ; for alignment, must be odd multiple of 8
+endstruc
+
+%define MOVPD movupd
+
+; transpose r0, r1, t0
+; Input looks like {r0 r1}
+; r0 = {a1 a0}
+; r1 = {b1 b0}
+;
+; output looks like
+; r0 = {b0, a0}
+; t0 = {b1, a1}
+
+%macro TRANSPOSE 3
+%define %%r0 %1
+%define %%r1 %2
+%define %%t0 %3
+ movapd %%t0, %%r0 ; t0 = a1 a0
+ shufpd %%r0, %%r1, 00b ; r0 = b0 a0
+ shufpd %%t0, %%r1, 11b ; t0 = b1 a1
+%endm
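+; (SSE shufpd overwrites its first operand, hence the extra movapd relative to the AVX version)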
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; packed-rotate-right-double
+; does a rotate by doing two shifts and an or
+%macro PRORQ 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ movdqa %%tmp, %%reg
+ psllq %%tmp, (64-(%%imm))
+ psrlq %%reg, %%imm
+ por %%reg, %%tmp
+%endmacro
+
+; PRORQ dst/src, amt
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ movdqa a0, e ; sig1: a0 = e
+	movdqa	a1, e		; sig1: a1 = e
+ PRORQ a0, (18-14) ; sig1: a0 = (e >> 4)
+
+ movdqa a2, f ; ch: a2 = f
+ pxor a2, g ; ch: a2 = f^g
+ pand a2, e ; ch: a2 = (f^g)&e
+ pxor a2, g ; a2 = ch
+
+ PRORQ a1, 41 ; sig1: a1 = (e >> 41)
+ movdqa [SZ2*(%%i&0xf) + rsp],%%T1
+ paddq %%T1,[TBL + ROUND] ; T1 = W + K
+	pxor	a0, e		; sig1: a0 = e ^ (e >> 4)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ paddq h, a2 ; h = h + ch
+ movdqa a2, a ; sig0: a2 = a
+ PRORQ a2, (34-28) ; sig0: a2 = (a >> 6)
+ paddq h, %%T1 ; h = h + ch + W + K
+ pxor a0, a1 ; a0 = sigma1
+ movdqa a1, a ; sig0: a1 = a
+ movdqa %%T1, a ; maj: T1 = a
+ PRORQ a1, 39 ; sig0: a1 = (a >> 39)
+ pxor %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ2 ; ROUND++
+ pand %%T1, b ; maj: T1 = (a^c)&b
+ paddq h, a0
+
+ paddq d, h
+
+	pxor	a2, a		; sig0: a2 = a ^ (a >> 6)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ pxor a2, a1 ; a2 = sig0
+ movdqa a1, a ; maj: a1 = a
+ pand a1, c ; maj: a1 = a&c
+ por a1, %%T1 ; a1 = maj
+ paddq h, a1 ; h = h + ch + W + K + maj
+ paddq h, a2 ; h = h + ch + W + K + maj + sigma0
+
+ ROTATE_ARGS
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ movdqa %%T1, [SZ2*((%%i-15)&0xf) + rsp]
+ movdqa a1, [SZ2*((%%i-2)&0xf) + rsp]
+ movdqa a0, %%T1
+ PRORQ %%T1, 8-1
+ movdqa a2, a1
+ PRORQ a1, 61-19
+ pxor %%T1, a0
+ PRORQ %%T1, 1
+ pxor a1, a2
+ PRORQ a1, 19
+ psrlq a0, 7
+ pxor %%T1, a0
+ psrlq a2, 6
+ pxor a1, a2
+ paddq %%T1, [SZ2*((%%i-16)&0xf) + rsp]
+ paddq a1, [SZ2*((%%i-7)&0xf) + rsp]
+ paddq %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+%endm
+
+;; void sha512_mb_x2_sse(SHA512_MB_ARGS_X4 *args, uint64_t num_blocks);
+;; arg 1 : STATE : pointer args (only 2 of the 4 lanes used)
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+;;
+global sha512_mb_x2_sse:function internal
+align 32
+sha512_mb_x2_sse:
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+ sub rsp, STACK_size
+
+ ;; Load the pre-transposed incoming digest.
+ movdqa a,[STATE + 0 * SHA512_DIGEST_ROW_SIZE]
+ movdqa b,[STATE + 1 * SHA512_DIGEST_ROW_SIZE]
+ movdqa c,[STATE + 2 * SHA512_DIGEST_ROW_SIZE]
+ movdqa d,[STATE + 3 * SHA512_DIGEST_ROW_SIZE]
+ movdqa e,[STATE + 4 * SHA512_DIGEST_ROW_SIZE]
+ movdqa f,[STATE + 5 * SHA512_DIGEST_ROW_SIZE]
+ movdqa g,[STATE + 6 * SHA512_DIGEST_ROW_SIZE]
+ movdqa h,[STATE + 7 * SHA512_DIGEST_ROW_SIZE]
+
+ lea TBL,[K512_2_MB]
+
+ ;; load the address of each of the 2 message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 +0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 +1*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+ ;; save old digest
+ movdqa [rsp + _DIGEST + 0*SZ2], a
+ movdqa [rsp + _DIGEST + 1*SZ2], b
+ movdqa [rsp + _DIGEST + 2*SZ2], c
+ movdqa [rsp + _DIGEST + 3*SZ2], d
+ movdqa [rsp + _DIGEST + 4*SZ2], e
+ movdqa [rsp + _DIGEST + 5*SZ2], f
+ movdqa [rsp + _DIGEST + 6*SZ2], g
+ movdqa [rsp + _DIGEST + 7*SZ2], h
+
+%assign i 0
+%rep 8
+ ;; load up the shuffler for little-endian to big-endian format
+ movdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ MOVPD TT0,[inp0+IDX+i*16] ;; double precision is 64 bits
+ MOVPD TT2,[inp1+IDX+i*16]
+ TRANSPOSE TT0, TT2, TT1
+ pshufb TT0, TMP
+ pshufb TT1, TMP
+ ROUND_00_15 TT0,(i*2+0)
+ ROUND_00_15 TT1,(i*2+1)
+%assign i (i+1)
+%endrep
+ add IDX, 8 * 16 ;; increment by a message block
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ paddq a, [rsp + _DIGEST + 0*SZ2]
+ paddq b, [rsp + _DIGEST + 1*SZ2]
+ paddq c, [rsp + _DIGEST + 2*SZ2]
+ paddq d, [rsp + _DIGEST + 3*SZ2]
+ paddq e, [rsp + _DIGEST + 4*SZ2]
+ paddq f, [rsp + _DIGEST + 5*SZ2]
+ paddq g, [rsp + _DIGEST + 6*SZ2]
+ paddq h, [rsp + _DIGEST + 7*SZ2]
+
+ sub INP_SIZE, 1 ;; unit is blocks
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ movdqa [STATE + 0*SHA512_DIGEST_ROW_SIZE],a
+ movdqa [STATE + 1*SHA512_DIGEST_ROW_SIZE],b
+ movdqa [STATE + 2*SHA512_DIGEST_ROW_SIZE],c
+ movdqa [STATE + 3*SHA512_DIGEST_ROW_SIZE],d
+ movdqa [STATE + 4*SHA512_DIGEST_ROW_SIZE],e
+ movdqa [STATE + 5*SHA512_DIGEST_ROW_SIZE],f
+ movdqa [STATE + 6*SHA512_DIGEST_ROW_SIZE],g
+ movdqa [STATE + 7*SHA512_DIGEST_ROW_SIZE],h
+
+ ; update input pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, STACK_size
+ ret
+
+section .data
+align 64
+global K512_2_MB:data internal
+K512_2_MB:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817
+
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm
new file mode 100644
index 000000000..6931bedc1
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x4_avx2.asm
@@ -0,0 +1,483 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+default rel
+
+;; code to compute quad SHA512 using AVX2
+;; use YMMs to tackle the larger digest size
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; ymm0-15
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers: rax rbx rdx r8 r9 r10 r11 r12
+;; Windows preserves: rcx rsi rdi rbp r13 r14 r15
+;;
+;; Linux clobbers: rax rbx rsi r8 r9 r10 r11 r12
+;; Linux preserves: rcx rdx rdi rbp r13 r14 r15
+;;
+;; clobbers ymm0-15
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*4
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+%ifidn __OUTPUT_FORMAT__, elf64
+; LINUX register definitions
+%define arg1 rdi
+%define arg2 rsi
+%else
+; Windows register definitions
+%define arg1 rcx
+%define arg2 rdx
+%endif
+
+; Common definitions
+%define STATE arg1
+%define INP_SIZE arg2
+
+%define IDX rax
+%define ROUND rbx
+%define TBL r8
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+
+%define a ymm0
+%define b ymm1
+%define c ymm2
+%define d ymm3
+%define e ymm4
+%define f ymm5
+%define g ymm6
+%define h ymm7
+
+%define a0 ymm8
+%define a1 ymm9
+%define a2 ymm10
+
+%define TT0 ymm14
+%define TT1 ymm13
+%define TT2 ymm12
+%define TT3 ymm11
+%define TT4 ymm10
+%define TT5 ymm9
+
+%define T1 ymm14
+%define TMP ymm15
+
+%define SZ4 4*SHA512_DIGEST_WORD_SIZE ; Size of one vector register
+%define ROUNDS 80*SZ4
+
+; Define stack usage
+
+;; Assume stack aligned to 32 bytes before call
+;; Therefore FRAMESZ mod 32 must be 32-8 = 24
+struc stack_frame
+ .data resb 16*SZ4
+ .digest resb NUM_SHA512_DIGEST_WORDS*SZ4
+ .align resb 24
+endstruc
+
+%define _DIGEST stack_frame.digest
+
+%define VMOVPD vmovupd
+
+; operates on YMMs
+; transpose r0, r1, r2, r3, t0, t1
+; "transpose" data in {r0..r3} using temps {t0..t3}
+; Input looks like: {r0 r1 r2 r3}
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+;
+; output looks like: {t0 r1 r0 r3}
+; t0 = {d1 d0 c1 c0 b1 b0 a1 a0}
+; r1 = {d3 d2 c3 c2 b3 b2 a3 a2}
+; r0 = {d5 d4 c5 c4 b5 b4 a5 a4}
+; r3 = {d7 d6 c7 c6 b7 b6 a7 a6}
+;
+%macro TRANSPOSE 6
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%t0 %5
+%define %%t1 %6
+ ; vshufps does not cross the mid-way boundary and hence is cheaper
+ vshufps %%t0, %%r0, %%r1, 0x44 ; t0 = {b5 b4 a5 a4 b1 b0 a1 a0}
+ vshufps %%r0, %%r0, %%r1, 0xEE ; r0 = {b7 b6 a7 a6 b3 b2 a3 a2}
+
+ vshufps %%t1, %%r2, %%r3, 0x44 ; t1 = {d5 d4 c5 c4 d1 d0 c1 c0}
+ vshufps %%r2, %%r2, %%r3, 0xEE ; r2 = {d7 d6 c7 c6 d3 d2 c3 c2}
+
+ vperm2f128 %%r1, %%r0, %%r2, 0x20; r1 = {d3 d2 c3 c2 b3 b2 a3 a2}
+
+ vperm2f128 %%r3, %%r0, %%r2, 0x31; r3 = {d7 d6 c7 c6 b7 b6 a7 a6}
+
+ vperm2f128 %%r0, %%t0, %%t1, 0x31; r0 = {d5 d4 c5 c4 b5 b4 a5 a4}
+
+ ; now ok to clobber t0
+ vperm2f128 %%t0, %%t0, %%t1, 0x20; t0 = {d1 d0 c1 c0 b1 b0 a1 a0}
+
+%endmacro
+
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ h
+%xdefine h g
+%xdefine g f
+%xdefine f e
+%xdefine e d
+%xdefine d c
+%xdefine c b
+%xdefine b a
+%xdefine a TMP_
+%endm
+
+; PRORQ reg, imm, tmp
+; packed-rotate-right-double
+; does a rotate by doing two shifts and an or
+%macro PRORQ 3
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+ vpsllq %%tmp, %%reg, (64-(%%imm))
+ vpsrlq %%reg, %%reg, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; non-destructive
+; PRORQ_nd reg, imm, tmp, src
+%macro PRORQ_nd 4
+%define %%reg %1
+%define %%imm %2
+%define %%tmp %3
+%define %%src %4
+ vpsllq %%tmp, %%src, (64-(%%imm))
+ vpsrlq %%reg, %%src, %%imm
+ vpor %%reg, %%reg, %%tmp
+%endmacro
+
+; PRORQ dst/src, amt
+%macro PRORQ 2
+ PRORQ %1, %2, TMP
+%endmacro
+
+; PRORQ_nd dst, src, amt
+%macro PRORQ_nd 3
+ PRORQ_nd %1, %3, TMP, %2
+%endmacro
+
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_00_15 2
+%define %%T1 %1
+%define %%i %2
+ PRORQ_nd a0, e, (18-14) ; sig1: a0 = (e >> 4)
+
+ vpxor a2, f, g ; ch: a2 = f^g
+ vpand a2, a2, e ; ch: a2 = (f^g)&e
+ vpxor a2, a2, g ; a2 = ch
+
+ PRORQ_nd a1, e, 41 ; sig1: a1 = (e >> 41)
+ vmovdqa [SZ4*(%%i&0xf) + rsp],%%T1
+ vpaddq %%T1,%%T1,[TBL + ROUND] ; T1 = W + K
+	vpxor	a0, a0, e	; sig1: a0 = e ^ (e >> 4)
+ PRORQ a0, 14 ; sig1: a0 = (e >> 14) ^ (e >> 18)
+ vpaddq h, h, a2 ; h = h + ch
+ PRORQ_nd a2, a, (34-28) ; sig0: a2 = (a >> 6)
+ vpaddq h, h, %%T1 ; h = h + ch + W + K
+ vpxor a0, a0, a1 ; a0 = sigma1
+ vmovdqa %%T1, a ; maj: T1 = a
+ PRORQ_nd a1, a, 39 ; sig0: a1 = (a >> 39)
+ vpxor %%T1, %%T1, c ; maj: T1 = a^c
+ add ROUND, SZ4 ; ROUND++
+ vpand %%T1, %%T1, b ; maj: T1 = (a^c)&b
+ vpaddq h, h, a0
+
+ vpaddq d, d, h
+
+	vpxor	a2, a2, a	; sig0: a2 = a ^ (a >> 6)
+ PRORQ a2, 28 ; sig0: a2 = (a >> 28) ^ (a >> 34)
+ vpxor a2, a2, a1 ; a2 = sig0
+ vpand a1, a, c ; maj: a1 = a&c
+ vpor a1, a1, %%T1 ; a1 = maj
+ vpaddq h, h, a1 ; h = h + ch + W + K + maj
+ vpaddq h, h, a2 ; h = h + ch + W + K + maj + sigma0
+ ROTATE_ARGS
+
+%endm
+
+
+;; arguments passed implicitly in preprocessor symbols i, a...h
+%macro ROUND_16_XX 2
+%define %%T1 %1
+%define %%i %2
+ vmovdqa %%T1, [SZ4*((%%i-15)&0xf) + rsp]
+ vmovdqa a1, [SZ4*((%%i-2)&0xf) + rsp]
+ vmovdqa a0, %%T1
+ PRORQ %%T1, 8-1
+ vmovdqa a2, a1
+ PRORQ a1, 61-19
+ vpxor %%T1, %%T1, a0
+ PRORQ %%T1, 1
+ vpxor a1, a1, a2
+ PRORQ a1, 19
+ vpsrlq a0, a0, 7
+ vpxor %%T1, %%T1, a0
+ vpsrlq a2, a2, 6
+ vpxor a1, a1, a2
+ vpaddq %%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp]
+ vpaddq a1, a1, [SZ4*((%%i-7)&0xf) + rsp]
+ vpaddq %%T1, %%T1, a1
+
+ ROUND_00_15 %%T1, %%i
+
+%endm
+
+
+;; void sha512_mb_x4_avx2(SHA512_MB_ARGS_X4 *STATE, const int INP_SIZE)
+;; arg 1 : STATE : pointer to input data
+;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
+global sha512_mb_x4_avx2:function internal
+align 32
+sha512_mb_x4_avx2:
+ ; general registers preserved in outer calling routine
+ ; outer calling routine saves all the XMM registers
+
+ sub rsp, stack_frame_size
+
+ ;; Load the pre-transposed incoming digest.
+ vmovdqu a, [STATE+ 0*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu b, [STATE+ 1*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu c, [STATE+ 2*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu d, [STATE+ 3*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu e, [STATE+ 4*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu f, [STATE+ 5*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu g, [STATE+ 6*SHA512_DIGEST_ROW_SIZE]
+ vmovdqu h, [STATE+ 7*SHA512_DIGEST_ROW_SIZE]
+
+
+ lea TBL,[K512_4_MB]
+
+ ;; load the address of each of the MAX_LANES (4) message lanes
+ ;; getting ready to transpose input onto stack
+ mov inp0,[STATE + _data_ptr_sha512 + 0*PTR_SZ]
+ mov inp1,[STATE + _data_ptr_sha512 + 1*PTR_SZ]
+ mov inp2,[STATE + _data_ptr_sha512 + 2*PTR_SZ]
+ mov inp3,[STATE + _data_ptr_sha512 + 3*PTR_SZ]
+
+ xor IDX, IDX
+lloop:
+ xor ROUND, ROUND
+
+ ;; save old digest
+ vmovdqa [rsp + _DIGEST + 0*SZ4], a
+ vmovdqa [rsp + _DIGEST + 1*SZ4], b
+ vmovdqa [rsp + _DIGEST + 2*SZ4], c
+ vmovdqa [rsp + _DIGEST + 3*SZ4], d
+ vmovdqa [rsp + _DIGEST + 4*SZ4], e
+ vmovdqa [rsp + _DIGEST + 5*SZ4], f
+ vmovdqa [rsp + _DIGEST + 6*SZ4], g
+ vmovdqa [rsp + _DIGEST + 7*SZ4], h
+
+%assign i 0
+%rep 4
+ ;; load up the shuffler for little-endian to big-endian format
+ vmovdqa TMP, [PSHUFFLE_BYTE_FLIP_MASK]
+ VMOVPD TT2,[inp0+IDX+i*32]
+ VMOVPD TT1,[inp1+IDX+i*32]
+ VMOVPD TT4,[inp2+IDX+i*32]
+ VMOVPD TT3,[inp3+IDX+i*32]
+ TRANSPOSE TT2, TT1, TT4, TT3, TT0, TT5
+ vpshufb TT0, TT0, TMP
+ vpshufb TT1, TT1, TMP
+ vpshufb TT2, TT2, TMP
+ vpshufb TT3, TT3, TMP
+ ROUND_00_15 TT0,(i*4+0)
+ ROUND_00_15 TT1,(i*4+1)
+ ROUND_00_15 TT2,(i*4+2)
+ ROUND_00_15 TT3,(i*4+3)
+%assign i (i+1)
+%endrep
+;; Increment IDX by message block size == 4 (loop) * 32 (YMM width in bytes)
+ add IDX, 4 * 32
+
+%assign i (i*4)
+
+ jmp Lrounds_16_xx
+align 16
+Lrounds_16_xx:
+%rep 16
+ ROUND_16_XX T1, i
+%assign i (i+1)
+%endrep
+
+ cmp ROUND,ROUNDS
+ jb Lrounds_16_xx
+
+ ;; add old digest
+ vpaddq a, a, [rsp + _DIGEST + 0*SZ4]
+ vpaddq b, b, [rsp + _DIGEST + 1*SZ4]
+ vpaddq c, c, [rsp + _DIGEST + 2*SZ4]
+ vpaddq d, d, [rsp + _DIGEST + 3*SZ4]
+ vpaddq e, e, [rsp + _DIGEST + 4*SZ4]
+ vpaddq f, f, [rsp + _DIGEST + 5*SZ4]
+ vpaddq g, g, [rsp + _DIGEST + 6*SZ4]
+ vpaddq h, h, [rsp + _DIGEST + 7*SZ4]
+
+ sub INP_SIZE, 1 ;; consumed one message block
+ jne lloop
+
+ ; write back to memory (state object) the transposed digest
+ vmovdqu [STATE+ 0*SHA512_DIGEST_ROW_SIZE ],a
+ vmovdqu [STATE+ 1*SHA512_DIGEST_ROW_SIZE ],b
+ vmovdqu [STATE+ 2*SHA512_DIGEST_ROW_SIZE ],c
+ vmovdqu [STATE+ 3*SHA512_DIGEST_ROW_SIZE ],d
+ vmovdqu [STATE+ 4*SHA512_DIGEST_ROW_SIZE ],e
+ vmovdqu [STATE+ 5*SHA512_DIGEST_ROW_SIZE ],f
+ vmovdqu [STATE+ 6*SHA512_DIGEST_ROW_SIZE ],g
+ vmovdqu [STATE+ 7*SHA512_DIGEST_ROW_SIZE ],h
+
+ ;; update input data pointers
+ add inp0, IDX
+ mov [STATE + _data_ptr_sha512 + 0*PTR_SZ], inp0
+ add inp1, IDX
+ mov [STATE + _data_ptr_sha512 + 1*PTR_SZ], inp1
+ add inp2, IDX
+ mov [STATE + _data_ptr_sha512 + 2*PTR_SZ], inp2
+ add inp3, IDX
+ mov [STATE + _data_ptr_sha512 + 3*PTR_SZ], inp3
+
+ ;;;;;;;;;;;;;;;;
+ ;; Postamble
+
+ add rsp, stack_frame_size
+
+ ; outer calling routine restores XMM and other GP registers
+ ret
+
+section .data
+align 64
+K512_4_MB:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817
+
+align 32
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
+
+
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm
new file mode 100644
index 000000000..cc8d85122
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_mb_x8_avx512.asm
@@ -0,0 +1,639 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "sha512_mb_mgr_datastruct.asm"
+%include "reg_sizes.asm"
+
+%ifdef HAVE_AS_KNOWS_AVX512
+default rel
+;; code to compute SHA512 by-8 using AVX512
+;; use ZMMs to tackle the larger digest size
+;; outer calling routine takes care of save and restore of XMM registers
+;; Logic designed/laid out by JDG
+
+;; Function clobbers: rax, rcx, rdx, rbx, rsi, rdi, r9-r15; zmm0-31
+;; Stack must be aligned to 32 bytes before call
+;; Windows clobbers: rax rbx rdx rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
+;; Windows preserves: rcx rsi
+;;
+;; Linux clobbers: rax rbx rcx rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+;; Linux preserves: rdx rdi
+;;
+;; clobbers zmm0-31
+
+%define APPEND(a,b) a %+ b
+
+%ifidn __OUTPUT_FORMAT__, win64
+ %define arg1 rcx ; arg0 preserved
+ %define arg2 rdx ; arg1
+ %define reg3 r8 ; arg2 preserved
+ %define reg4 r9 ; arg3
+ %define var1 rdi ; usable
+ %define var2 rsi
+ %define local_func_decl(func_name) global func_name
+ %else
+ %define arg1 rdi ; arg0
+ %define arg2 rsi ; arg1
+ %define var2 rdx ; arg2
+ %define var1 rcx ; arg3 usable
+ %define local_func_decl(func_name) global func_name:function internal
+%endif
+
+%define state arg1
+%define num_blks arg2
+
+%define IN (state + _data_ptr)
+%define DIGEST state
+%define SIZE num_blks
+
+%define IDX var1
+%define TBL r8
+
+%define VMOVDQ32 vmovdqu32
+
+%define SHA512_DIGEST_WORD_SIZE 8
+%define NUM_SHA512_DIGEST_WORDS 8
+%define SHA512_DIGEST_ROW_SIZE 8*8
+%define PTR_SZ 8
+%define _data_ptr_sha512 _data_ptr
+
+%define NUM_LANES 8
+%define SZ 8
+%define SZ8 8 * SZ
+%define DIGEST_SZ 8 * SZ8
+%define DIGEST_SAVE NUM_LANES * DIGEST_SZ
+%define RSP_SAVE 1*8
+
+; Define Stack Layout
+START_FIELDS
+;;; name size align
+FIELD _DIGEST_SAVE, NUM_LANES*8*64, 64
+FIELD _RSP, 8, 8
+%assign STACK_SPACE _FIELD_OFFSET
+
+
+%define inp0 r9
+%define inp1 r10
+%define inp2 r11
+%define inp3 r12
+%define inp4 r13
+%define inp5 r14
+%define inp6 r15
+%define inp7 rax
+
+%define A zmm0
+%define B zmm1
+%define C zmm2
+%define D zmm3
+%define E zmm4
+%define F zmm5
+%define G zmm6
+%define H zmm7
+%define T1 zmm8
+%define TMP0 zmm9
+%define TMP1 zmm10
+%define TMP2 zmm11
+%define TMP3 zmm12
+%define TMP4 zmm13
+%define TMP5 zmm14
+%define TMP6 zmm15
+
+
+%define W0 zmm16
+%define W1 zmm17
+%define W2 zmm18
+%define W3 zmm19
+%define W4 zmm20
+%define W5 zmm21
+%define W6 zmm22
+%define W7 zmm23
+%define W8 zmm24
+%define W9 zmm25
+%define W10 zmm26
+%define W11 zmm27
+%define W12 zmm28
+%define W13 zmm29
+%define W14 zmm30
+%define W15 zmm31
+
+; from sha256_fips180-2.pdf
+; define rotates for Sigma function for main loop steps
+%define BIG_SIGMA_0_0 28 ; Sigma0
+%define BIG_SIGMA_0_1 34
+%define BIG_SIGMA_0_2 39
+%define BIG_SIGMA_1_0 14 ; Sigma1
+%define BIG_SIGMA_1_1 18
+%define BIG_SIGMA_1_2 41
+
+; define rotates for Sigma function for scheduling steps
+
+%define SMALL_SIGMA_0_0 1 ; sigma0
+%define SMALL_SIGMA_0_1 8
+%define SMALL_SIGMA_0_2 7
+%define SMALL_SIGMA_1_0 19 ; sigma1
+%define SMALL_SIGMA_1_1 61
+%define SMALL_SIGMA_1_2 6
+
+%define SHA_MAX_ROUNDS 80
+%define SHA_ROUNDS_LESS_16 (SHA_MAX_ROUNDS - 16)
+
+%macro TRANSPOSE8 12
+%define %%r0 %1
+%define %%r1 %2
+%define %%r2 %3
+%define %%r3 %4
+%define %%r4 %5
+%define %%r5 %6
+%define %%r6 %7
+%define %%r7 %8
+%define %%t0 %9
+%define %%t1 %10
+%define %%PERM_INDEX1 %11
+%define %%PERM_INDEX2 %12
+
+
+; each x(i) is 64 bits; 8 * 64 = 512 ==> one full ZMM register holds eight 64-bit lanes
+; r0 = {a7 a6 a5 a4 a3 a2 a1 a0}
+; r1 = {b7 b6 b5 b4 b3 b2 b1 b0}
+; r2 = {c7 c6 c5 c4 c3 c2 c1 c0}
+; r3 = {d7 d6 d5 d4 d3 d2 d1 d0}
+; r4 = {e7 e6 e5 e4 e3 e2 e1 e0}
+; r5 = {f7 f6 f5 f4 f3 f2 f1 f0}
+; r6 = {g7 g6 g5 g4 g3 g2 g1 g0}
+; r7 = {h7 h6 h5 h4 h3 h2 h1 h0}
+
+ ;; ;;; will not get clobbered
+ vmovdqa32 %%PERM_INDEX1, [TRANSPOSE8_PERM_INDEX_1] ; temp
+ vmovdqa32 %%PERM_INDEX2, [TRANSPOSE8_PERM_INDEX_2] ; temp
+
+ ; process top half (r0..r3) {a...d}
+ vshufpd %%t0, %%r0, %%r1, 0x00 ; t0 = {b6 a6 b4 a4 b2 a2 b0 a0}
+ vshufpd %%r0, %%r0, %%r1, 0xFF ; r0 = {b7 a7 b5 a5 b3 a3 b1 a1}
+ vshufpd %%t1, %%r2, %%r3, 0x00 ; t1 = {d6 c6 d4 c4 d2 c2 d0 c0}
+ vshufpd %%r2, %%r2, %%r3, 0xFF ; r2 = {d7 c7 d5 c5 d3 c3 d1 c1}
+
+ vmovdqa32 %%r1, %%t0 ; r1 and r3 free
+ vpermt2q %%r1, %%PERM_INDEX1,%%t1 ; r1 = {d4 c4 b4 a4 d0 c0 b0 a0}
+ vpermt2q %%t0, %%PERM_INDEX2,%%t1 ; t0 = {d6 c6 b6 a6 d2 c2 b2 a2}
+
+ vmovdqa32 %%t1, %%r0 ; t1 and r3 free
+ vpermt2q %%t1, %%PERM_INDEX1,%%r2 ; t1 = {d5 c5 b5 a5 d1 c1 b1 a1}
+ vpermt2q %%r0, %%PERM_INDEX2,%%r2 ; r0 = {d7 c7 b7 a7 d3 c3 b3 a3}
+
+	;; Likewise for the bottom half (r4..r7) {e...h}	; r2 and r3 free
+ vshufpd %%r2, %%r4, %%r5, 0x00 ; r2 = {f6 e6 f4 e4 f2 e2 f0 e0}
+ vshufpd %%r4, %%r4, %%r5, 0xFF ; r4 = {f7 e7 f5 e5 f3 e3 f1 e1}
+ vshufpd %%r3, %%r6, %%r7, 0x00 ; r3 = {h6 g6 h4 g4 h2 g2 h0 g0}
+ vshufpd %%r6, %%r6, %%r7, 0xFF ; r6 = {h7 g7 h5 g5 h3 g3 h1 g1}
+
+ vmovdqa32 %%r5, %%r2 ; r5 and r7 free
+ vpermt2q %%r5, %%PERM_INDEX1,%%r3 ; r5 = {h4 g4 f4 e4 h0 g0 f0 e0}
+ vpermt2q %%r2, %%PERM_INDEX2,%%r3 ; r2 = {h6 g6 f6 e6 h2 g2 f2 e2}
+
+ vmovdqa32 %%r7, %%r4
+ vpermt2q %%r7, %%PERM_INDEX1,%%r6 ; r7 = {h5 g5 f5 e5 h1 g1 f1 e1}
+ vpermt2q %%r4, %%PERM_INDEX2,%%r6 ; r4 = {h7 g7 f7 e7 h3 g3 f3 e3}
+
+;;; free r3, r6
+ vshuff64x2 %%r6, %%t0, %%r2, 0xEE ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+ vshuff64x2 %%r2, %%t0, %%r2, 0x44 ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+
+;;; t0 and r3 free
+ vshuff64x2 %%r3, %%r0, %%r4, 0x44 ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+ vshuff64x2 %%t0, %%r0, %%r4, 0xEE ; t0 = {h7 g7 f7 e7 d7 c7 b7 a7}
+
+ vshuff64x2 %%r4, %%r1, %%r5, 0xEE ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+ vshuff64x2 %%r0, %%r1, %%r5, 0x44 ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+
+
+ vshuff64x2 %%r5, %%t1, %%r7, 0xEE ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+ vshuff64x2 %%r1, %%t1, %%r7, 0x44 ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+
+ ;; will re-order input to avoid move
+ ;vmovdqa32 %%r7, %%t0
+
+ ; Output looks like: {r0 r1 r2 r3 r4 r5 r6 r7}
+ ; r0 = {h0 g0 f0 e0 d0 c0 b0 a0}
+ ; r1 = {h1 g1 f1 e1 d1 c1 b1 a1}
+ ; r2 = {h2 g2 f2 e2 d2 c2 b2 a2}
+ ; r3 = {h3 g3 f3 e3 d3 c3 b3 a3}
+ ; r4 = {h4 g4 f4 e4 d4 c4 b4 a4}
+ ; r5 = {h5 g5 f5 e5 d5 c5 b5 a5}
+ ; r6 = {h6 g6 f6 e6 d6 c6 b6 a6}
+ ; temp
+ ; r7 = {h7 g7 f7 e7 d7 c7 b7 a7}
+%endmacro
+
+%macro ROTATE_ARGS 0
+%xdefine TMP_ H
+%xdefine H G
+%xdefine G F
+%xdefine F E
+%xdefine E D
+%xdefine D C
+%xdefine C B
+%xdefine B A
+%xdefine A TMP_
+%endm
+
+
+
+;; CH(A, B, C) = (A&B) ^ (~A&C)
+;; MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G)
+;; SIGMA0 = ROR_28 ^ ROR_34 ^ ROR_39
+;; SIGMA1 = ROR_14 ^ ROR_18 ^ ROR_41
+;; sigma0 = ROR_1 ^ ROR_8 ^ SHR_7
+;; sigma1 = ROR_19 ^ ROR_61 ^ SHR_6
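+;;
+;; Below, each of these maps to a single vpternlogq whose immediate is the
+;; 3-input truth table of (dst, src2, src3): 0x96 = A^B^C (the Sigma/sigma xors),
+;; 0xCA = CH(A,B,C), 0xE8 = MAJ(A,B,C)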
+
+;; Main processing loop per round
+;; equivalent to %macro ROUND_00_15 2
+%macro PROCESS_LOOP 2
+%define %%WT %1
+%define %%ROUND %2
+ ;; T1 = H + BIG_SIGMA_1(E) + CH(E, F, G) + Kt + Wt
+ ;; T2 = BIG_SIGMA_0(A) + MAJ(A, B, C)
+ ;; H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
+
+ ;; H becomes T2, then add T1 for A
+ ;; D becomes D + T1 for E
+
+ vpaddq T1, H, TMP3 ; T1 = H + Kt
+ vmovdqa32 TMP0, E
+ ;; compute BIG_SIGMA_1(E)
+ vprorq TMP1, E, BIG_SIGMA_1_0 ; ROR_14(E)
+ vprorq TMP2, E, BIG_SIGMA_1_1 ; ROR_18(E)
+ vprorq TMP3, E, BIG_SIGMA_1_2 ; ROR_41(E)
+ vpternlogq TMP1, TMP2, TMP3, 0x96 ; TMP1 = BIG_SIGMA_1(E)
+ vpternlogq TMP0, F, G, 0xCA ; TMP0 = CH(E,F,G)
+ vpaddq T1, T1, %%WT ; T1 = T1 + Wt
+ vpaddq T1, T1, TMP0 ; T1 = T1 + CH(E,F,G)
+ vpaddq T1, T1, TMP1 ; T1 = T1 + BIG_SIGMA_1(E)
+ vpaddq D, D, T1 ; D = D + T1
+ vprorq H, A, BIG_SIGMA_0_0 ;ROR_28(A)
+ vprorq TMP2, A, BIG_SIGMA_0_1 ;ROR_34(A)
+ vprorq TMP3, A, BIG_SIGMA_0_2 ;ROR_39(A)
+ vmovdqa32 TMP0, A
+ vpternlogq TMP0, B, C, 0xE8 ; TMP0 = MAJ(A,B,C)
+ vpternlogq H, TMP2, TMP3, 0x96 ; H(T2) = BIG_SIGMA_0(A)
+ vpaddq H, H, TMP0 ; H(T2) = BIG_SIGMA_0(A) + MAJ(A,B,C)
+ vpaddq H, H, T1 ; H(A) = H(T2) + T1
+ vmovdqa32 TMP3, [TBL + ((%%ROUND+1)*64)] ; Next Kt
+
+ ;; Rotate the args A-H (rotation of names associated with regs)
+ ROTATE_ARGS
+%endmacro
+
+%macro MSG_SCHED_ROUND_16_79 4
+%define %%WT %1
+%define %%WTp1 %2
+%define %%WTp9 %3
+%define %%WTp14 %4
+ vprorq TMP4, %%WTp14, SMALL_SIGMA_1_0 ; ROR_19(Wt-2)
+ vprorq TMP5, %%WTp14, SMALL_SIGMA_1_1 ; ROR_61(Wt-2)
+ vpsrlq TMP6, %%WTp14, SMALL_SIGMA_1_2 ; SHR_6(Wt-2)
+ vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_1(Wt-2)
+
+ vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2)
+ vpaddq %%WT, %%WT, %%WTp9 ; Wt = Wt-16 + sigma_1(Wt-2) + Wt-7
+
+ vprorq TMP4, %%WTp1, SMALL_SIGMA_0_0 ; ROR_1(Wt-15)
+ vprorq TMP5, %%WTp1, SMALL_SIGMA_0_1 ; ROR_8(Wt-15)
+ vpsrlq TMP6, %%WTp1, SMALL_SIGMA_0_2 ; SHR_7(Wt-15)
+ vpternlogq TMP4, TMP5, TMP6, 0x96 ; TMP4 = sigma_0(Wt-15)
+
+ vpaddq %%WT, %%WT, TMP4 ; Wt = Wt-16 + sigma_1(Wt-2) +
+						; Wt-7 + sigma_0(Wt-15)
+
+%endmacro
+
+align 64
+
+; void sha512_mb_x8_avx512(SHA512_MB_ARGS_X8, uint32_t size)
+; arg 1 : pointer to input data
+; arg 2 : size (in blocks) ;; assumed to be >= 1
+local_func_decl(sha512_mb_x8_avx512)
+sha512_mb_x8_avx512:
+ mov rax, rsp
+ sub rsp, STACK_SPACE
+ and rsp, ~63 ; align stack to multiple of 64
+ mov [rsp + _RSP], rax
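+	; save the caller's rsp so it can be restored on exit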
+ lea TBL,[TABLE]
+
+ ;; Initialize digests
+ vmovups A, [DIGEST + 0*8*8]
+ vmovups B, [DIGEST + 1*8*8]
+ vmovups C, [DIGEST + 2*8*8]
+ vmovups D, [DIGEST + 3*8*8]
+ vmovups E, [DIGEST + 4*8*8]
+ vmovups F, [DIGEST + 5*8*8]
+ vmovups G, [DIGEST + 6*8*8]
+ vmovups H, [DIGEST + 7*8*8]
+
+ xor IDX, IDX
+ ;; Read in input data address, saving them in registers because
+ ;; they will serve as variables, which we shall keep incrementing
+ mov inp0, [IN + 0*8]
+ mov inp1, [IN + 1*8]
+ mov inp2, [IN + 2*8]
+ mov inp3, [IN + 3*8]
+ mov inp4, [IN + 4*8]
+ mov inp5, [IN + 5*8]
+ mov inp6, [IN + 6*8]
+ mov inp7, [IN + 7*8]
+
+lloop:
+
+ ;; first half of 1024 (need to transpose before use)
+ vmovups W0,[inp0 + IDX ]
+ vmovups W1,[inp1 + IDX ]
+ vmovups W2,[inp2 + IDX ]
+ vmovups W3,[inp3 + IDX ]
+ vmovups W4,[inp4 + IDX ]
+ vmovups W5,[inp5 + IDX ]
+ vmovups W6,[inp6 + IDX ]
+ vmovups TMP0,[inp7 + IDX ]
+ TRANSPOSE8 W0, W1, W2, W3, W4, W5, W6, TMP0, W7, TMP1, TMP2, TMP3
+ ;; second half of 1024 (need to transpose before use)
+ vmovups W8,[inp0 + SZ8 + IDX ]
+ vmovups W9,[inp1 + SZ8 + IDX ]
+ vmovups W10,[inp2 + SZ8 + IDX ]
+ vmovups W11,[inp3 + SZ8 + IDX ]
+ vmovups W12,[inp4 + SZ8 + IDX ]
+ vmovups W13,[inp5 + SZ8 + IDX ]
+ vmovups W14,[inp6 + SZ8 + IDX ]
+ vmovups TMP0,[inp7 + SZ8 + IDX ]
+ TRANSPOSE8 W8, W9, W10, W11, W12, W13, W14, TMP0, W15, TMP1, TMP2, TMP3
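+	;; After the two transposes, W0..W15 each hold the same message word
+	;; from all 8 lanes (one lane per qword), ready for lane-parallel rounds.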
+
+ vmovdqa32 TMP2, [PSHUFFLE_BYTE_FLIP_MASK]
+
+ vmovdqa32 TMP3, [TBL] ; First K
+
+ ; Save digests for later addition
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ add IDX, 128 ; increment by message block length in bytes
+
+
+
+
+%assign I 0
+%rep 16
+;;; little endian to big endian
+ vpshufb APPEND(W,I), APPEND(W,I), TMP2
+%assign I (I+1)
+%endrep
+ ; Save digests for later addition
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*0], A
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*1], B
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*2], C
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*3], D
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*4], E
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*5], F
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*6], G
+ vmovdqa32 [rsp + _DIGEST_SAVE + 64*7], H
+
+ ; MSG Schedule for W0-W15 is now complete in registers
+	; Process the first (max_rounds - 16) rounds
+ ; Calculate next Wt+16 after processing is complete and Wt is unneeded
+
+ ; PROCESS_LOOP_00_79 APPEND(W,J), I, APPEND(W,K), APPEND(W,L), APPEND(W,M)
+
+%assign I 0
+%assign J 0
+%assign K 1
+%assign L 9
+%assign M 14
+%rep SHA_ROUNDS_LESS_16
+ PROCESS_LOOP APPEND(W,J), I
+ MSG_SCHED_ROUND_16_79 APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
+%assign I (I+1)
+%assign J ((J+1)% 16)
+%assign K ((K+1)% 16)
+%assign L ((L+1)% 16)
+%assign M ((M+1)% 16)
+%endrep
+	; Check if this is the last block
+ sub SIZE, 1
+ je lastLoop
+
+ ; Process last 16 rounds
+ ; Read in next block msg data for use in first 16 words of msg sched
+%assign I SHA_ROUNDS_LESS_16
+%assign J 0
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+ ; Add old digest
+ vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+ jmp lloop
+
+
+lastLoop:
+ ; Process last 16 rounds
+%assign I SHA_ROUNDS_LESS_16
+%assign J 0
+
+%rep 16
+ PROCESS_LOOP APPEND(W,J), I
+%assign I (I+1)
+%assign J (J+1)
+%endrep
+
+ ; Add old digest
+ vpaddq A, A, [rsp + _DIGEST_SAVE + 64*0]
+ vpaddq B, B, [rsp + _DIGEST_SAVE + 64*1]
+ vpaddq C, C, [rsp + _DIGEST_SAVE + 64*2]
+ vpaddq D, D, [rsp + _DIGEST_SAVE + 64*3]
+ vpaddq E, E, [rsp + _DIGEST_SAVE + 64*4]
+ vpaddq F, F, [rsp + _DIGEST_SAVE + 64*5]
+ vpaddq G, G, [rsp + _DIGEST_SAVE + 64*6]
+ vpaddq H, H, [rsp + _DIGEST_SAVE + 64*7]
+
+;; update into data pointers
+%assign I 0
+%rep 4
+ mov inp0, [IN + (2*I)*8]
+ mov inp1, [IN + (2*I +1)*8]
+ add inp0, IDX
+ add inp1, IDX
+ mov [IN + (2*I)*8], inp0
+ mov [IN + (2*I+1)*8], inp1
+%assign I (I+1)
+%endrep
+
+ VMOVDQ32 [DIGEST + 0*8*8], A
+ VMOVDQ32 [DIGEST + 1*8*8], B
+ VMOVDQ32 [DIGEST + 2*8*8], C
+ VMOVDQ32 [DIGEST + 3*8*8], D
+ VMOVDQ32 [DIGEST + 4*8*8], E
+ VMOVDQ32 [DIGEST + 5*8*8], F
+ VMOVDQ32 [DIGEST + 6*8*8], G
+ VMOVDQ32 [DIGEST + 7*8*8], H
+
+ mov rsp, [rsp + _RSP]
+ ret
+
+ section .data
+align 64
+; 80 constants for SHA512
+; replicated for each of the 8 lanes, thus 8*80 qwords
+; to aid SIMD loads (a space-for-time tradeoff)
+; local to this asm file, used nowhere else
+TABLE:
+ dq 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22, 0x428a2f98d728ae22
+ dq 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd, 0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f, 0xb5c0fbcfec4d3b2f
+ dq 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc, 0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538, 0x3956c25bf348b538
+ dq 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019, 0x59f111f1b605d019
+ dq 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b, 0x923f82a4af194f9b
+ dq 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118, 0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242, 0xd807aa98a3030242
+ dq 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe, 0x12835b0145706fbe
+ dq 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c, 0x243185be4ee4b28c
+ dq 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2, 0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f, 0x72be5d74f27b896f
+ dq 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1, 0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235, 0x9bdc06a725c71235
+ dq 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694, 0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2, 0xe49b69c19ef14ad2
+ dq 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3, 0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5, 0x0fc19dc68b8cd5b5
+ dq 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65, 0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275, 0x2de92c6f592b0275
+ dq 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483, 0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4, 0x5cb0a9dcbd41fbd4
+ dq 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5, 0x76f988da831153b5
+ dq 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab, 0x983e5152ee66dfab
+ dq 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210, 0xa831c66d2db43210
+ dq 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f, 0xb00327c898fb213f
+ dq 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4, 0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2, 0xc6e00bf33da88fc2
+ dq 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725, 0xd5a79147930aa725
+ dq 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f, 0x06ca6351e003826f
+ dq 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70, 0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc, 0x27b70a8546d22ffc
+ dq 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926, 0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed, 0x4d2c6dfc5ac42aed
+ dq 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df, 0x53380d139d95b3df
+ dq 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de, 0x650a73548baf63de
+ dq 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8, 0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6, 0x81c2c92e47edaee6
+ dq 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b, 0x92722c851482353b
+ dq 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364, 0xa2bfe8a14cf10364
+ dq 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001, 0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791, 0xc24b8b70d0f89791
+ dq 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30, 0xc76c51a30654be30
+ dq 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218, 0xd192e819d6ef5218
+ dq 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910, 0xd69906245565a910
+ dq 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a, 0xf40e35855771202a
+ dq 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8, 0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8, 0x19a4c116b8d2d0c8
+ dq 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53, 0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99, 0x2748774cdf8eeb99
+ dq 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8, 0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63, 0x391c0cb3c5c95a63
+ dq 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb, 0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373, 0x5b9cca4f7763e373
+ dq 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3, 0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc, 0x748f82ee5defb2fc
+ dq 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60, 0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72, 0x84c87814a1f0ab72
+ dq 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec, 0x8cc702081a6439ec
+ dq 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28, 0x90befffa23631e28
+ dq 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9, 0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915, 0xbef9a3f7b2c67915
+ dq 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b, 0xc67178f2e372532b
+ dq 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c, 0xca273eceea26619c
+ dq 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207, 0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e, 0xeada7dd6cde0eb1e
+ dq 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178, 0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba, 0x06f067aa72176fba
+ dq 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6, 0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae, 0x113f9804bef90dae
+ dq 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b, 0x1b710b35131c471b
+ dq 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84, 0x28db77f523047d84
+ dq 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493, 0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc, 0x3c9ebe0a15c9bebc
+ dq 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c, 0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6, 0x4cc5d4becb3e42b6
+ dq 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a, 0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec, 0x5fcb6fab3ad6faec
+ dq 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817, 0x6c44198c4a475817
+
+align 64
+; byte-swap mask: converts each quad word between big and little endian, applied ZMM-wide
+;; a shuffle on a ZMM register operates on 4 XMM-sized chunks of 128 bits each
+PSHUFFLE_BYTE_FLIP_MASK: dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+ dq 0x1011121314151617, 0x18191a1b1c1d1e1f
+ dq 0x2021222324252627, 0x28292a2b2c2d2e2f
+ dq 0x3031323334353637, 0x38393a3b3c3d3e3f
+
+align 64
+TRANSPOSE8_PERM_INDEX_1: dq 0x0000000000000000
+ dq 0x0000000000000001
+ dq 0x0000000000000008
+ dq 0x0000000000000009
+ dq 0x0000000000000004
+ dq 0x0000000000000005
+ dq 0x000000000000000C
+ dq 0x000000000000000D
+
+TRANSPOSE8_PERM_INDEX_2: dq 0x0000000000000002
+ dq 0x0000000000000003
+ dq 0x000000000000000A
+ dq 0x000000000000000B
+ dq 0x0000000000000006
+ dq 0x0000000000000007
+ dq 0x000000000000000E
+ dq 0x000000000000000F
+
+%else
+%ifidn __OUTPUT_FORMAT__, win64
+global no_sha512_mb_x8_avx512
+no_sha512_mb_x8_avx512:
+%endif
+%endif ; HAVE_AS_KNOWS_AVX512
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm
new file mode 100644
index 000000000..e1186f8a0
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_multibinary.asm
@@ -0,0 +1,254 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%ifidn __OUTPUT_FORMAT__, elf64
+%define WRT_OPT wrt ..plt
+%else
+%define WRT_OPT
+%endif
+
+%include "reg_sizes.asm"
+%include "multibinary.asm"
+
+;;;;;
+; mbin_dispatch_init_avoton parameters
+; Use this function when SSE/00/01 is a minimum requirement
+; if AVOTON is true, then use avoton_func instead of sse_func
+; 1-> function name
+; 2-> SSE/00/01 optimized function used as base
+; 3-> AVX or AVX/02 opt func
+; 4-> AVX2 or AVX/04 opt func
+; 5-> AVOTON opt func
+;;;;;
+%macro mbin_dispatch_init_avoton 5
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01
+
+ mov eax, 1
+ cpuid
+ lea mbin_rdi, [%5 WRT_OPT]
+ and eax, FLAG_CPUID1_EAX_STEP_MASK
+ cmp eax, FLAG_CPUID1_EAX_AVOTON
+ ; If Avoton, set Avoton symbol and exit
+ cmove mbin_rsi, mbin_rdi
+ je _%1_init_done
+
+ and ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ cmp ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
+ lea mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func
+ jne _%1_init_done ; AVX is not available so end
+ mov mbin_rsi, mbin_rbx
+
+ ;; Try for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ lea mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func
+ cmovne mbin_rsi, mbin_rbx
+
+ ;; Does it have xmm and ymm support
+ xor ecx, ecx
+ xgetbv
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ je _%1_init_done
+ lea mbin_rsi, [%2 WRT_OPT]
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+;;;;;
+; mbin_dispatch_init6_avoton parameters
+; if AVOTON is true, then use avoton_func instead of sse_func
+; 1-> function name
+; 2-> base function
+; 3-> SSE4_1 or 00/01 optimized function
+; 4-> AVX/02 opt func
+; 5-> AVX2/04 opt func
+; 6-> AVX512/06 opt func
+; 7-> AVOTON opt func
+;;;;;
+%macro mbin_dispatch_init6_avoton 7
+ section .text
+ %1_dispatch_init:
+ push mbin_rsi
+ push mbin_rax
+ push mbin_rbx
+ push mbin_rcx
+ push mbin_rdx
+ push mbin_rdi
+ lea mbin_rsi, [%2 WRT_OPT] ; Default - use base function
+
+ mov eax, 1
+ cpuid
+ mov ebx, ecx ; save cpuid1.ecx
+ test ecx, FLAG_CPUID1_ECX_SSE4_1
+ je _%1_init_done ; Use base function if no SSE4_1
+ lea mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt
+
+ lea mbin_rdi, [%7 WRT_OPT]
+ and eax, FLAG_CPUID1_EAX_STEP_MASK
+ cmp eax, FLAG_CPUID1_EAX_AVOTON
+ ; If Avoton, set Avoton symbol and exit
+ cmove mbin_rsi, mbin_rdi
+ je _%1_init_done
+
+
+ ;; Test for XMM_YMM support/AVX
+ test ecx, FLAG_CPUID1_ECX_OSXSAVE
+ je _%1_init_done
+ xor ecx, ecx
+ xgetbv ; xcr -> edx:eax
+ mov edi, eax ; save xgetvb.eax
+
+ and eax, FLAG_XGETBV_EAX_XMM_YMM
+ cmp eax, FLAG_XGETBV_EAX_XMM_YMM
+ jne _%1_init_done
+ test ebx, FLAG_CPUID1_ECX_AVX
+ je _%1_init_done
+ lea mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt
+
+ ;; Test for AVX2
+ xor ecx, ecx
+ mov eax, 7
+ cpuid
+ test ebx, FLAG_CPUID7_EBX_AVX2
+ je _%1_init_done ; No AVX2 possible
+ lea mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func
+
+ ;; Test for AVX512
+ and edi, FLAG_XGETBV_EAX_ZMM_OPM
+ cmp edi, FLAG_XGETBV_EAX_ZMM_OPM
+ jne _%1_init_done ; No AVX512 possible
+ and ebx, FLAGS_CPUID7_ECX_AVX512_G1
+ cmp ebx, FLAGS_CPUID7_ECX_AVX512_G1
+ lea mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
+ cmove mbin_rsi, mbin_rbx
+
+ _%1_init_done:
+ pop mbin_rdi
+ pop mbin_rdx
+ pop mbin_rcx
+ pop mbin_rbx
+ pop mbin_rax
+ mov [%1_dispatched], mbin_rsi
+ pop mbin_rsi
+ ret
+%endmacro
+
+default rel
+[bits 64]
+
+%define def_wrd dq
+%define wrd_sz qword
+%define arg1 rsi
+
+; declare the L3 ctx level symbols (these will then call the appropriate
+; L2 symbols)
+extern sha512_ctx_mgr_init_sse
+extern sha512_ctx_mgr_submit_sse
+extern sha512_ctx_mgr_flush_sse
+
+extern sha512_ctx_mgr_init_avx
+extern sha512_ctx_mgr_submit_avx
+extern sha512_ctx_mgr_flush_avx
+
+extern sha512_ctx_mgr_init_avx2
+extern sha512_ctx_mgr_submit_avx2
+extern sha512_ctx_mgr_flush_avx2
+
+%ifdef HAVE_AS_KNOWS_AVX512
+ extern sha512_ctx_mgr_init_avx512
+ extern sha512_ctx_mgr_submit_avx512
+ extern sha512_ctx_mgr_flush_avx512
+%endif
+
+extern sha512_ctx_mgr_init_sb_sse4
+extern sha512_ctx_mgr_submit_sb_sse4
+extern sha512_ctx_mgr_flush_sb_sse4
+
+;;; *_mbinit are the initial values for *_dispatched, which is updated on the first call.
+;;; Therefore, *_dispatch_init is only executed on the first call.
+
+; Initialise symbols
+mbin_interface sha512_ctx_mgr_init
+mbin_interface sha512_ctx_mgr_submit
+mbin_interface sha512_ctx_mgr_flush
+
+%ifdef HAVE_AS_KNOWS_AVX512
+	; Reuse mbin_dispatch_init6 by replacing the base with the sse version
+ mbin_dispatch_init6_avoton sha512_ctx_mgr_init, sha512_ctx_mgr_init_sse, \
+ sha512_ctx_mgr_init_sse, sha512_ctx_mgr_init_avx, \
+ sha512_ctx_mgr_init_avx2, sha512_ctx_mgr_init_avx512, \
+ sha512_ctx_mgr_init_sb_sse4
+
+ mbin_dispatch_init6_avoton sha512_ctx_mgr_submit, sha512_ctx_mgr_submit_sse, \
+ sha512_ctx_mgr_submit_sse, sha512_ctx_mgr_submit_avx, \
+ sha512_ctx_mgr_submit_avx2, sha512_ctx_mgr_submit_avx512, \
+ sha512_ctx_mgr_submit_sb_sse4
+
+ mbin_dispatch_init6_avoton sha512_ctx_mgr_flush, sha512_ctx_mgr_flush_sse, \
+ sha512_ctx_mgr_flush_sse, sha512_ctx_mgr_flush_avx, \
+ sha512_ctx_mgr_flush_avx2, sha512_ctx_mgr_flush_avx512, \
+ sha512_ctx_mgr_flush_sb_sse4
+%else
+ mbin_dispatch_init_avoton sha512_ctx_mgr_init, sha512_ctx_mgr_init_sse, \
+ sha512_ctx_mgr_init_avx, sha512_ctx_mgr_init_avx2, \
+ sha512_ctx_mgr_init_sb_sse4
+
+ mbin_dispatch_init_avoton sha512_ctx_mgr_submit, sha512_ctx_mgr_submit_sse, \
+ sha512_ctx_mgr_submit_avx, sha512_ctx_mgr_submit_avx2, \
+ sha512_ctx_mgr_submit_sb_sse4
+
+ mbin_dispatch_init_avoton sha512_ctx_mgr_flush, sha512_ctx_mgr_flush_sse, \
+ sha512_ctx_mgr_flush_avx, sha512_ctx_mgr_flush_avx2, \
+ sha512_ctx_mgr_flush_sb_sse4
+%endif
+
+
+;;; func core, ver, snum
+slversion sha512_ctx_mgr_init, 00, 03, 0175
+slversion sha512_ctx_mgr_submit, 00, 03, 0176
+slversion sha512_ctx_mgr_flush, 00, 03, 0177
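
The macros above implement dispatch-on-first-call: each interface symbol calls through a *_dispatched pointer whose initial value (*_mbinit) causes *_dispatch_init to run on the first call; the dispatcher probes the CPU, stores the address of the best implementation in *_dispatched, and subsequent calls go straight to it. The C sketch below shows the same pattern in miniature. It is illustrative only: the names (sha512_init_fn, cpu_has_avx2, sha512_ctx_mgr_init_sketch) are not part of ISA-L, and cpu_has_avx2 is a stub standing in for the CPUID/XGETBV checks performed by the macros.

    #include <stdint.h>

    typedef void (*sha512_init_fn)(void *mgr);

    /* Stubs standing in for the SSE/AVX2 routines selected above. */
    static void sha512_init_sse(void *mgr)  { (void)mgr; /* baseline path */ }
    static void sha512_init_avx2(void *mgr) { (void)mgr; /* wider path    */ }

    /* Placeholder for the CPUID leaf-7 / XGETBV feature checks in the macros. */
    static int cpu_has_avx2(void) { return 0; }

    static void sha512_init_dispatch(void *mgr);

    /* Like *_dispatched: initially routes to the dispatcher, overwritten once. */
    static sha512_init_fn sha512_init_dispatched = sha512_init_dispatch;

    static void sha512_init_dispatch(void *mgr)
    {
            sha512_init_fn best = cpu_has_avx2() ? sha512_init_avx2 : sha512_init_sse;
            sha512_init_dispatched = best;   /* later calls skip feature detection */
            best(mgr);
    }

    /* The exported interface simply calls through the pointer. */
    void sha512_ctx_mgr_init_sketch(void *mgr)
    {
            sha512_init_dispatched(mgr);
    }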
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c
new file mode 100644
index 000000000..bb9a8f5e8
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_ref.c
@@ -0,0 +1,256 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include "sha512_mb.h"
+
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+// Reference SHA512 Functions
+////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////
+
+#define H0 0x6a09e667f3bcc908
+#define H1 0xbb67ae8584caa73b
+#define H2 0x3c6ef372fe94f82b
+#define H3 0xa54ff53a5f1d36f1
+#define H4 0x510e527fade682d1
+#define H5 0x9b05688c2b3e6c1f
+#define H6 0x1f83d9abfb41bd6b
+#define H7 0x5be0cd19137e2179
+
+void sha512_single(const uint8_t * data, uint64_t digest[8]);
+
+void sha512_ref(uint8_t * input_data, uint64_t * digest, uint32_t len)
+{
+ uint32_t i, j;
+ uint8_t buf[2 * SHA512_BLOCK_SIZE];
+
+ /* 128 bit lengths not needed as len is uint32_t, so use 64 bit length
+ * and pad the first 64 bits with zeros. */
+ union {
+ uint64_t uint;
+ uint8_t uchar[8];
+ } convert;
+ uint8_t *p;
+
+ digest[0] = H0;
+ digest[1] = H1;
+ digest[2] = H2;
+ digest[3] = H3;
+ digest[4] = H4;
+ digest[5] = H5;
+ digest[6] = H6;
+ digest[7] = H7;
+
+ i = len;
+ /* Hash the complete blocks */
+ while (i >= SHA512_BLOCK_SIZE) {
+ sha512_single(input_data, digest);
+ input_data += SHA512_BLOCK_SIZE;
+ i -= SHA512_BLOCK_SIZE;
+ }
+
+ /* Copy remainder to a buffer to be padded */
+ memcpy(buf, input_data, i);
+ buf[i++] = 0x80;
+
+ // Pad more than required here and overwrite with length
+ for (j = i; j < (2 * SHA512_BLOCK_SIZE); j++)
+ buf[j] = 0;
+
+ if (i > SHA512_BLOCK_SIZE - SHA512_PADLENGTHFIELD_SIZE)
+ i = 2 * SHA512_BLOCK_SIZE;
+ else
+ i = SHA512_BLOCK_SIZE;
+
+	convert.uint = 8 * (uint64_t) len;
+ p = buf + i - 8;
+ p[0] = convert.uchar[7];
+ p[1] = convert.uchar[6];
+ p[2] = convert.uchar[5];
+ p[3] = convert.uchar[4];
+ p[4] = convert.uchar[3];
+ p[5] = convert.uchar[2];
+ p[6] = convert.uchar[1];
+ p[7] = convert.uchar[0];
+
+ /* Hash the padded last block */
+ sha512_single(buf, digest);
+	if (i == 2 * SHA512_BLOCK_SIZE)
+		sha512_single(buf + SHA512_BLOCK_SIZE, digest);
+}
+
+/* From the FIPS, these are the same as for SHA256, but operating on 64 bit words
+ * instead of 32 bit.
+ */
+#define ch(e,f,g) ((e & f) ^ (g & ~e))
+#define maj(a,b,c) ((a & b) ^ (a & c) ^ (b & c))
+
+/* Sigma functions have same form as SHA256 but
+ * - change the word size to 64bit
+ * - change the amount to rotate
+ */
+#define ror64(x, r) (((x)>>(r)) ^ ((x)<<(64-(r))))
+
+/* Technically, s0 and s1 here are the "capital sigma" functions and should be named S0 and S1,
+ * and likewise S0 and S1 below should be s0 and s1, but keep the names as-is to avoid
+ * confusion with the other reference functions.
+ */
+#define s0(a) (ror64(a,28) ^ ror64(a,34) ^ ror64(a,39))
+#define s1(e) (ror64(e,14) ^ ror64(e,18) ^ ror64(e,41))
+
+#define S0(w) (ror64(w,1) ^ ror64(w,8) ^ (w >> 7))
+#define S1(w) (ror64(w,19) ^ ror64(w,61) ^ (w >> 6))
+
+#define bswap(x) (((x) & (0xffull << 0)) << 56) \
+ | (((x) & (0xffull << 8)) << 40) \
+ | (((x) & (0xffull <<16)) << 24) \
+ | (((x) & (0xffull <<24)) << 8) \
+ | (((x) & (0xffull <<32)) >> 8) \
+ | (((x) & (0xffull <<40)) >> 24) \
+ | (((x) & (0xffull <<48)) >> 40) \
+ | (((x) & (0xffull <<56)) >> 56)
+
+#define W(x) w[(x) & 15]
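+/* The message schedule is kept in a 16-entry circular buffer: W(i) aliases w[i & 15]. */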
+
+#define step(i,a,b,c,d,e,f,g,h,k) \
+ if (i<16) W(i) = bswap(ww[i]); \
+ else \
+ W(i) = W(i-16) + S0(W(i-15)) + W(i-7) + S1(W(i-2)); \
+ t2 = s0(a) + maj(a,b,c); \
+ t1 = h + s1(e) + ch(e,f,g) + k + W(i); \
+ d += t1; \
+ h = t1 + t2;
+
+void sha512_single(const uint8_t * data, uint64_t digest[8])
+{
+ /* Check these are all uint64_t */
+ uint64_t a, b, c, d, e, f, g, h, t1, t2;
+ uint64_t w[16];
+ uint64_t *ww = (uint64_t *) data;
+
+ a = digest[0];
+ b = digest[1];
+ c = digest[2];
+ d = digest[3];
+ e = digest[4];
+ f = digest[5];
+ g = digest[6];
+ h = digest[7];
+
+ step(0, a, b, c, d, e, f, g, h, 0x428a2f98d728ae22);
+ step(1, h, a, b, c, d, e, f, g, 0x7137449123ef65cd);
+ step(2, g, h, a, b, c, d, e, f, 0xb5c0fbcfec4d3b2f);
+ step(3, f, g, h, a, b, c, d, e, 0xe9b5dba58189dbbc);
+ step(4, e, f, g, h, a, b, c, d, 0x3956c25bf348b538);
+ step(5, d, e, f, g, h, a, b, c, 0x59f111f1b605d019);
+ step(6, c, d, e, f, g, h, a, b, 0x923f82a4af194f9b);
+ step(7, b, c, d, e, f, g, h, a, 0xab1c5ed5da6d8118);
+ step(8, a, b, c, d, e, f, g, h, 0xd807aa98a3030242);
+ step(9, h, a, b, c, d, e, f, g, 0x12835b0145706fbe);
+ step(10, g, h, a, b, c, d, e, f, 0x243185be4ee4b28c);
+ step(11, f, g, h, a, b, c, d, e, 0x550c7dc3d5ffb4e2);
+ step(12, e, f, g, h, a, b, c, d, 0x72be5d74f27b896f);
+ step(13, d, e, f, g, h, a, b, c, 0x80deb1fe3b1696b1);
+ step(14, c, d, e, f, g, h, a, b, 0x9bdc06a725c71235);
+ step(15, b, c, d, e, f, g, h, a, 0xc19bf174cf692694);
+ step(16, a, b, c, d, e, f, g, h, 0xe49b69c19ef14ad2);
+ step(17, h, a, b, c, d, e, f, g, 0xefbe4786384f25e3);
+ step(18, g, h, a, b, c, d, e, f, 0x0fc19dc68b8cd5b5);
+ step(19, f, g, h, a, b, c, d, e, 0x240ca1cc77ac9c65);
+ step(20, e, f, g, h, a, b, c, d, 0x2de92c6f592b0275);
+ step(21, d, e, f, g, h, a, b, c, 0x4a7484aa6ea6e483);
+ step(22, c, d, e, f, g, h, a, b, 0x5cb0a9dcbd41fbd4);
+ step(23, b, c, d, e, f, g, h, a, 0x76f988da831153b5);
+ step(24, a, b, c, d, e, f, g, h, 0x983e5152ee66dfab);
+ step(25, h, a, b, c, d, e, f, g, 0xa831c66d2db43210);
+ step(26, g, h, a, b, c, d, e, f, 0xb00327c898fb213f);
+ step(27, f, g, h, a, b, c, d, e, 0xbf597fc7beef0ee4);
+ step(28, e, f, g, h, a, b, c, d, 0xc6e00bf33da88fc2);
+ step(29, d, e, f, g, h, a, b, c, 0xd5a79147930aa725);
+ step(30, c, d, e, f, g, h, a, b, 0x06ca6351e003826f);
+ step(31, b, c, d, e, f, g, h, a, 0x142929670a0e6e70);
+ step(32, a, b, c, d, e, f, g, h, 0x27b70a8546d22ffc);
+ step(33, h, a, b, c, d, e, f, g, 0x2e1b21385c26c926);
+ step(34, g, h, a, b, c, d, e, f, 0x4d2c6dfc5ac42aed);
+ step(35, f, g, h, a, b, c, d, e, 0x53380d139d95b3df);
+ step(36, e, f, g, h, a, b, c, d, 0x650a73548baf63de);
+ step(37, d, e, f, g, h, a, b, c, 0x766a0abb3c77b2a8);
+ step(38, c, d, e, f, g, h, a, b, 0x81c2c92e47edaee6);
+ step(39, b, c, d, e, f, g, h, a, 0x92722c851482353b);
+ step(40, a, b, c, d, e, f, g, h, 0xa2bfe8a14cf10364);
+ step(41, h, a, b, c, d, e, f, g, 0xa81a664bbc423001);
+ step(42, g, h, a, b, c, d, e, f, 0xc24b8b70d0f89791);
+ step(43, f, g, h, a, b, c, d, e, 0xc76c51a30654be30);
+ step(44, e, f, g, h, a, b, c, d, 0xd192e819d6ef5218);
+ step(45, d, e, f, g, h, a, b, c, 0xd69906245565a910);
+ step(46, c, d, e, f, g, h, a, b, 0xf40e35855771202a);
+ step(47, b, c, d, e, f, g, h, a, 0x106aa07032bbd1b8);
+ step(48, a, b, c, d, e, f, g, h, 0x19a4c116b8d2d0c8);
+ step(49, h, a, b, c, d, e, f, g, 0x1e376c085141ab53);
+ step(50, g, h, a, b, c, d, e, f, 0x2748774cdf8eeb99);
+ step(51, f, g, h, a, b, c, d, e, 0x34b0bcb5e19b48a8);
+ step(52, e, f, g, h, a, b, c, d, 0x391c0cb3c5c95a63);
+ step(53, d, e, f, g, h, a, b, c, 0x4ed8aa4ae3418acb);
+ step(54, c, d, e, f, g, h, a, b, 0x5b9cca4f7763e373);
+ step(55, b, c, d, e, f, g, h, a, 0x682e6ff3d6b2b8a3);
+ step(56, a, b, c, d, e, f, g, h, 0x748f82ee5defb2fc);
+ step(57, h, a, b, c, d, e, f, g, 0x78a5636f43172f60);
+ step(58, g, h, a, b, c, d, e, f, 0x84c87814a1f0ab72);
+ step(59, f, g, h, a, b, c, d, e, 0x8cc702081a6439ec);
+ step(60, e, f, g, h, a, b, c, d, 0x90befffa23631e28);
+ step(61, d, e, f, g, h, a, b, c, 0xa4506cebde82bde9);
+ step(62, c, d, e, f, g, h, a, b, 0xbef9a3f7b2c67915);
+ step(63, b, c, d, e, f, g, h, a, 0xc67178f2e372532b); // step 63
+ step(64, a, b, c, d, e, f, g, h, 0xca273eceea26619c);
+ step(65, h, a, b, c, d, e, f, g, 0xd186b8c721c0c207);
+ step(66, g, h, a, b, c, d, e, f, 0xeada7dd6cde0eb1e);
+ step(67, f, g, h, a, b, c, d, e, 0xf57d4f7fee6ed178);
+ step(68, e, f, g, h, a, b, c, d, 0x06f067aa72176fba);
+ step(69, d, e, f, g, h, a, b, c, 0x0a637dc5a2c898a6);
+ step(70, c, d, e, f, g, h, a, b, 0x113f9804bef90dae);
+ step(71, b, c, d, e, f, g, h, a, 0x1b710b35131c471b);
+ step(72, a, b, c, d, e, f, g, h, 0x28db77f523047d84);
+ step(73, h, a, b, c, d, e, f, g, 0x32caab7b40c72493);
+ step(74, g, h, a, b, c, d, e, f, 0x3c9ebe0a15c9bebc);
+ step(75, f, g, h, a, b, c, d, e, 0x431d67c49c100d4c);
+ step(76, e, f, g, h, a, b, c, d, 0x4cc5d4becb3e42b6);
+ step(77, d, e, f, g, h, a, b, c, 0x597f299cfc657e2a);
+ step(78, c, d, e, f, g, h, a, b, 0x5fcb6fab3ad6faec);
+ step(79, b, c, d, e, f, g, h, a, 0x6c44198c4a475817); // step 79
+
+ digest[0] += a;
+ digest[1] += b;
+ digest[2] += c;
+ digest[3] += d;
+ digest[4] += e;
+ digest[5] += f;
+ digest[6] += g;
+ digest[7] += h;
+}
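
sha512_ref() above is the plain C reference implementation: it pads internally and takes the length in bytes. A standalone calling sketch follows; it is not part of the ISA-L sources, the main() wrapper is illustrative only, and it would be compiled together with sha512_ref.c.

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Prototype as defined in sha512_ref.c above. */
    void sha512_ref(uint8_t *input_data, uint64_t *digest, uint32_t len);

    int main(void)
    {
            uint8_t msg[] = "abc";
            uint64_t digest[8];

            /* sha512_ref handles padding internally, so any byte length is fine. */
            sha512_ref(msg, digest, (uint32_t)strlen((char *)msg));

            for (int i = 0; i < 8; i++)
                    printf("%016" PRIx64, digest[i]);
            printf("\n");
            return 0;
    }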
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c
new file mode 100644
index 000000000..6eeed19fd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_flush_sse4.c
@@ -0,0 +1,46 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include "sha512_mb.h"
+
+/*
+ * Function: sha512_sb_mgr_flush_sse4.
+ *
+ * Description: This is a dummy API. Nothing done here.
+ *
+ * Return: always NULL.
+ *
+ * */
+SHA512_JOB *sha512_sb_mgr_flush_sse4(SHA512_MB_JOB_MGR * state)
+{
+ return NULL;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c
new file mode 100644
index 000000000..93ce88dfd
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_init_sse4.c
@@ -0,0 +1,38 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include "sha512_mb.h"
+
+// For single-buffer APIs, nothing needs to be done here.
+// This function is required to comply with the usage of the
+// multi-buffer APIs.
+void sha512_sb_mgr_init_sse4(SHA512_MB_JOB_MGR * state)
+{
+ return;
+}
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c
new file mode 100644
index 000000000..659d14339
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sb_mgr_submit_sse4.c
@@ -0,0 +1,65 @@
+/**********************************************************************
+ Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+ * Neither the name of Intel Corporation nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include "sha512_mb.h"
+
+/*
+ * Function: sha512_sb_mgr_submit_sse4
+ *
+ * Description: Wrapper API for the update routine of single-buffer SHA512,
+ *              to comply with the multi-buffer API.
+ *
+ * This function will pick up message/digest and length
+ * information from the argument "job", then call into
+ * sha512_sse4(). Argument "state" is passed in, but not
+ * really used here.
+ *
+ * Note: message init and padding are done outside. This function
+ * 		expects a packed buffer.
+ *
+ * Argument: state - not really used.
+ *           job - contains the message, digest, message length information, etc.
+ *
+ * Return: SHA512_JOB pointer.
+ *
+ **/
+SHA512_JOB *sha512_sb_mgr_submit_sse4(SHA512_MB_JOB_MGR * state, SHA512_JOB * job)
+{
+ assert(job != NULL);
+
+ uint8_t *buff = job->buffer;
+ uint64_t *digest = job->result_digest, len = job->len;
+
+ sha512_sse4((const void *)buff, (void *)digest, len);
+
+ return job;
+}
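
The wrapper above only forwards job->buffer, job->result_digest and job->len to sha512_sse4(), so the buffer must already contain whole, padded SHA512 blocks and len is a block count. A minimal calling sketch follows; it assumes only the SHA512_JOB fields used by the wrapper (the full layout is defined in sha512_mb.h), and real applications would normally go through the sha512_ctx_mgr_* interface instead.

    #include <stdint.h>
    #include <string.h>
    #include "sha512_mb.h"

    /* Illustrative only: submit one pre-padded block through the single-buffer wrapper. */
    static void submit_one_block_example(SHA512_MB_JOB_MGR *state,
                                         uint8_t block[SHA512_BLOCK_SIZE])
    {
            SHA512_JOB job;

            memset(&job, 0, sizeof(job));
            job.buffer = block;   /* packed, already padded message */
            job.len = 1;          /* length in SHA512 blocks, not bytes */
            /* initial H0..H7 values would normally be placed in job.result_digest here */

            sha512_sb_mgr_submit_sse4(state, &job);   /* returns &job once hashed */
    }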
diff --git a/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm
new file mode 100644
index 000000000..57598a0e2
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/sha512_mb/sha512_sse4.asm
@@ -0,0 +1,394 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2011-2016 Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+; * Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
+; * Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in
+; the documentation and/or other materials provided with the
+; distribution.
+; * Neither the name of Intel Corporation nor the names of its
+; contributors may be used to endorse or promote products derived
+; from this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+%include "reg_sizes.asm"
+default rel
+BITS 64
+section .text
+
+; Virtual Registers
+%ifidn __OUTPUT_FORMAT__, win64
+ %define msg rcx ; ARG1
+ %define digest rdx ; ARG2
+ %define msglen r8 ; ARG3
+ %define T1 rsi
+ %define T2 rdi
+%else
+ %define msg rdi ; ARG1
+ %define digest rsi ; ARG2
+ %define msglen rdx ; ARG3
+ %define T1 rcx
+ %define T2 r8
+%endif
+%define a_64 r9
+%define b_64 r10
+%define c_64 r11
+%define d_64 r12
+%define e_64 r13
+%define f_64 r14
+%define g_64 r15
+%define h_64 rbx
+%define tmp0 rax
+
+; Local variables (stack frame)
+; Note: frame_size must be an odd multiple of 8 bytes to XMM align RSP
+struc frame
+ .W: resq 80 ; Message Schedule
+ .WK: resq 2 ; W[t] + K[t] | W[t+1] + K[t+1]
+
+%ifidn __OUTPUT_FORMAT__, win64
+ .GPRSAVE: resq 7
+%else
+ .GPRSAVE: resq 5
+%endif
+endstruc
+
+; Useful QWORD "arrays" for simpler memory references
+%define MSG(i) msg + 8*(i) ; Input message (arg1)
+%define DIGEST(i) digest + 8*(i) ; Output Digest (arg2)
+%define K_t(i) K512 + 8*(i) ; SHA Constants (static mem)
+%define W_t(i) rsp + frame.W + 8*(i) ; Message Schedule (stack frame)
+%define WK_2(i) rsp + frame.WK + 8*((i) % 2) ; W[t]+K[t] (stack frame)
+; MSG, DIGEST, K_t, W_t are arrays
+; WK_2(t) points to 1 of 2 qwords at frame.WK depending on t being odd/even
+
+%macro RotateState 0
+	; Rotate symbols a..h right
+ %xdefine %%TMP h_64
+ %xdefine h_64 g_64
+ %xdefine g_64 f_64
+ %xdefine f_64 e_64
+ %xdefine e_64 d_64
+ %xdefine d_64 c_64
+ %xdefine c_64 b_64
+ %xdefine b_64 a_64
+ %xdefine a_64 %%TMP
+%endmacro
+
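+; Note on the rotate sequences used below: because ror amounts add up,
+; S1(e) = (e ror 14) ^ (e ror 18) ^ (e ror 41) is computed as
+; ((((e ror 23) ^ e) ror 4) ^ e) ror 14   (23+4+14 = 41, 4+14 = 18), and
+; S0(a) = (a ror 28) ^ (a ror 34) ^ (a ror 39) is computed as
+; ((((a ror  5) ^ a) ror 6) ^ a) ror 28   ( 5+6+28 = 39, 6+28 = 34).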
+%macro SHA512_Round 1
+%assign %%t (%1)
+
+ ; Compute Round %%t
+ mov T1, f_64 ; T1 = f
+ mov tmp0, e_64 ; tmp = e
+ xor T1, g_64 ; T1 = f ^ g
+ ror tmp0, 23 ; 41 ; tmp = e ror 23
+ and T1, e_64 ; T1 = (f ^ g) & e
+ xor tmp0, e_64 ; tmp = (e ror 23) ^ e
+ xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
+ add T1, [WK_2(%%t)] ; W[t] + K[t] from message scheduler
+ ror tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4
+ xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e
+ mov T2, a_64 ; T2 = a
+ add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h
+ ror tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
+ add T1, tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
+ mov tmp0, a_64 ; tmp = a
+ xor T2, c_64 ; T2 = a ^ c
+ and tmp0, c_64 ; tmp = a & c
+ and T2, b_64 ; T2 = (a ^ c) & b
+ xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
+ mov tmp0, a_64 ; tmp = a
+ ror tmp0, 5 ; 39 ; tmp = a ror 5
+ xor tmp0, a_64 ; tmp = (a ror 5) ^ a
+ add d_64, T1 ; e(next_state) = d + T1
+ ror tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6
+ xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a
+ lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c)
+ ror tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
+	add	h_64, tmp0	; a(next_state) = T1 + Maj(a,b,c) + S0(a)
+ RotateState
+%endmacro
+
+%macro SHA512_2Sched_2Round_sse 1
+%assign %%t (%1)
+
+ ; Compute rounds %%t-2 and %%t-1
+ ; Compute message schedule QWORDS %%t and %%t+1
+
+ ; Two rounds are computed based on the values for K[t-2]+W[t-2] and
+ ; K[t-1]+W[t-1] which were previously stored at WK_2 by the message
+ ; scheduler.
+ ; The two new schedule QWORDS are stored at [W_t(%%t)] and [W_t(%%t+1)].
+ ; They are then added to their respective SHA512 constants at
+ ; [K_t(%%t)] and [K_t(%%t+1)] and stored at dqword [WK_2(%%t)]
+	; For brevity, the comments following vectored instructions only refer to
+	; the first of a pair of QWORDS.
+	; E.g. XMM2=W[t-2] really means XMM2={W[t-2]|W[t-1]}
+ ; The computation of the message schedule and the rounds are tightly
+ ; stitched to take advantage of instruction-level parallelism.
+ ; For clarity, integer instructions (for the rounds calculation) are indented
+ ; by one tab. Vectored instructions (for the message scheduler) are indented
+ ; by two tabs.
+
+ mov T1, f_64
+ movdqa xmm2, [W_t(%%t-2)] ; XMM2 = W[t-2]
+ xor T1, g_64
+ and T1, e_64
+ movdqa xmm0, xmm2 ; XMM0 = W[t-2]
+ xor T1, g_64
+ add T1, [WK_2(%%t)]
+ movdqu xmm5, [W_t(%%t-15)] ; XMM5 = W[t-15]
+ mov tmp0, e_64
+ ror tmp0, 23 ; 41
+ movdqa xmm3, xmm5 ; XMM3 = W[t-15]
+ xor tmp0, e_64
+ ror tmp0, 4 ; 18
+ psrlq xmm0, 61 - 19 ; XMM0 = W[t-2] >> 42
+ xor tmp0, e_64
+ ror tmp0, 14 ; 14
+ psrlq xmm3, (8 - 7) ; XMM3 = W[t-15] >> 1
+ add T1, tmp0
+ add T1, h_64
+ pxor xmm0, xmm2 ; XMM0 = (W[t-2] >> 42) ^ W[t-2]
+ mov T2, a_64
+ xor T2, c_64
+ pxor xmm3, xmm5 ; XMM3 = (W[t-15] >> 1) ^ W[t-15]
+ and T2, b_64
+ mov tmp0, a_64
+ psrlq xmm0, 19 - 6 ; XMM0 = ((W[t-2]>>42)^W[t-2])>>13
+ and tmp0, c_64
+ xor T2, tmp0
+ psrlq xmm3, (7 - 1) ; XMM3 = ((W[t-15]>>1)^W[t-15])>>6
+ mov tmp0, a_64
+ ror tmp0, 5 ; 39
+ pxor xmm0, xmm2 ; XMM0 = (((W[t-2]>>42)^W[t-2])>>13)^W[t-2]
+ xor tmp0, a_64
+ ror tmp0, 6 ; 34
+ pxor xmm3, xmm5 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]
+ xor tmp0, a_64
+ ror tmp0, 28 ; 28
+ psrlq xmm0, 6 ; XMM0 = ((((W[t-2]>>42)^W[t-2])>>13)^W[t-2])>>6
+ add T2, tmp0
+ add d_64, T1
+ psrlq xmm3, 1 ; XMM3 = (((W[t-15]>>1)^W[t-15])>>6)^W[t-15]>>1
+ lea h_64, [T1 + T2]
+ RotateState
+ movdqa xmm1, xmm2 ; XMM1 = W[t-2]
+ mov T1, f_64
+ xor T1, g_64
+ movdqa xmm4, xmm5 ; XMM4 = W[t-15]
+ and T1, e_64
+ xor T1, g_64
+ psllq xmm1, (64 - 19) - (64 - 61) ; XMM1 = W[t-2] << 42
+ add T1, [WK_2(%%t+1)]
+ mov tmp0, e_64
+ psllq xmm4, (64 - 1) - (64 - 8) ; XMM4 = W[t-15] << 7
+ ror tmp0, 23 ; 41
+ xor tmp0, e_64
+ pxor xmm1, xmm2 ; XMM1 = (W[t-2] << 42)^W[t-2]
+ ror tmp0, 4 ; 18
+ xor tmp0, e_64
+ pxor xmm4, xmm5 ; XMM4 = (W[t-15]<<7)^W[t-15]
+ ror tmp0, 14 ; 14
+ add T1, tmp0
+ psllq xmm1, (64 - 61) ; XMM1 = ((W[t-2] << 42)^W[t-2])<<3
+ add T1, h_64
+ mov T2, a_64
+ psllq xmm4, (64 - 8) ; XMM4 = ((W[t-15]<<7)^W[t-15])<<56
+ xor T2, c_64
+ and T2, b_64
+ pxor xmm0, xmm1 ; XMM0 = s1(W[t-2])
+ mov tmp0, a_64
+ and tmp0, c_64
+ movdqu xmm1, [W_t(%%t- 7)] ; XMM1 = W[t-7]
+ xor T2, tmp0
+ pxor xmm3, xmm4 ; XMM3 = s0(W[t-15])
+ mov tmp0, a_64
+ paddq xmm0, xmm3 ; XMM0 = s1(W[t-2]) + s0(W[t-15])
+ ror tmp0, 5 ; 39
+ paddq xmm0, [W_t(%%t-16)] ; XMM0 = s1(W[t-2]) + s0(W[t-15]) + W[t-16]
+ xor tmp0, a_64
+ paddq xmm0, xmm1 ; XMM0 = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
+ ror tmp0, 6 ; 34
+ movdqa [W_t(%%t)], xmm0 ; Store scheduled qwords
+ xor tmp0, a_64
+		paddq	xmm0, [K_t(%%t)]	; Compute W[t]+K[t]
+	ror	tmp0, 28 ; 28
+		movdqa	[WK_2(%%t)], xmm0	; Store W[t]+K[t] for next rounds
+ add T2, tmp0
+ add d_64, T1
+ lea h_64, [T1 + T2]
+ RotateState
+%endmacro
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; void sha512_sse4(const void* M, void* D, uint64_t L);
+; Purpose: Updates the SHA512 digest stored at D with the message stored in M.
+; The size of the message pointed to by M must be an integer multiple of the SHA512
+; block size.
+; L is the message length in SHA512 blocks.
+global sha512_sse4:function
+sha512_sse4:
+ cmp msglen, 0
+ je .nowork
+
+ ; Allocate Stack Space
+ sub rsp, frame_size
+
+ ; Save GPRs
+ mov [rsp + frame.GPRSAVE + 8 * 0], rbx
+ mov [rsp + frame.GPRSAVE + 8 * 1], r12
+ mov [rsp + frame.GPRSAVE + 8 * 2], r13
+ mov [rsp + frame.GPRSAVE + 8 * 3], r14
+ mov [rsp + frame.GPRSAVE + 8 * 4], r15
+%ifidn __OUTPUT_FORMAT__, win64
+ mov [rsp + frame.GPRSAVE + 8 * 5], rsi
+ mov [rsp + frame.GPRSAVE + 8 * 6], rdi
+%endif
+
+.updateblock:
+
+ ; Load state variables
+ mov a_64, [DIGEST(0)]
+ mov b_64, [DIGEST(1)]
+ mov c_64, [DIGEST(2)]
+ mov d_64, [DIGEST(3)]
+ mov e_64, [DIGEST(4)]
+ mov f_64, [DIGEST(5)]
+ mov g_64, [DIGEST(6)]
+ mov h_64, [DIGEST(7)]
+
+ %assign t 0
+ %rep 80/2 + 1
+ ; (80 rounds) / (2 rounds/iteration) + (1 iteration)
+ ; +1 iteration because the scheduler leads hashing by 1 iteration
+ %if t < 2
+ ; BSWAP 2 QWORDS
+ movdqa xmm1, [XMM_QWORD_BSWAP]
+ movdqu xmm0, [MSG(t)]
+ pshufb xmm0, xmm1 ; BSWAP
+ movdqa [W_t(t)], xmm0 ; Store Scheduled Pair
+ paddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ movdqa [WK_2(t)], xmm0 ; Store into WK for rounds
+ %elif t < 16
+ ; BSWAP 2 QWORDS; Compute 2 Rounds
+ movdqu xmm0, [MSG(t)]
+ pshufb xmm0, xmm1 ; BSWAP
+ SHA512_Round t - 2 ; Round t-2
+ movdqa [W_t(t)], xmm0 ; Store Scheduled Pair
+ paddq xmm0, [K_t(t)] ; Compute W[t]+K[t]
+ SHA512_Round t - 1 ; Round t-1
+ movdqa [WK_2(t)], xmm0 ; Store W[t]+K[t] into WK
+ %elif t < 79
+ ; Schedule 2 QWORDS; Compute 2 Rounds
+ SHA512_2Sched_2Round_sse t
+ %else
+ ; Compute 2 Rounds
+ SHA512_Round t - 2
+ SHA512_Round t - 1
+ %endif
+ %assign t t+2
+ %endrep
+
+ ; Update digest
+ add [DIGEST(0)], a_64
+ add [DIGEST(1)], b_64
+ add [DIGEST(2)], c_64
+ add [DIGEST(3)], d_64
+ add [DIGEST(4)], e_64
+ add [DIGEST(5)], f_64
+ add [DIGEST(6)], g_64
+ add [DIGEST(7)], h_64
+
+ ; Advance to next message block
+ add msg, 16*8
+ dec msglen
+ jnz .updateblock
+
+ ; Restore GPRs
+ mov rbx, [rsp + frame.GPRSAVE + 8 * 0]
+ mov r12, [rsp + frame.GPRSAVE + 8 * 1]
+ mov r13, [rsp + frame.GPRSAVE + 8 * 2]
+ mov r14, [rsp + frame.GPRSAVE + 8 * 3]
+ mov r15, [rsp + frame.GPRSAVE + 8 * 4]
+%ifidn __OUTPUT_FORMAT__, win64
+ mov rsi, [rsp + frame.GPRSAVE + 8 * 5]
+ mov rdi, [rsp + frame.GPRSAVE + 8 * 6]
+%endif
+ ; Restore Stack Pointer
+ add rsp, frame_size
+
+.nowork:
+ ret
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; Binary Data
+
+section .data
+
+ALIGN 16
+
+; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
+XMM_QWORD_BSWAP:
+ dq 0x0001020304050607, 0x08090a0b0c0d0e0f
+
+; K[t] used in SHA512 hashing
+K512:
+ dq 0x428a2f98d728ae22,0x7137449123ef65cd
+ dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ dq 0x3956c25bf348b538,0x59f111f1b605d019
+ dq 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ dq 0xd807aa98a3030242,0x12835b0145706fbe
+ dq 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ dq 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ dq 0x9bdc06a725c71235,0xc19bf174cf692694
+ dq 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ dq 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ dq 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ dq 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ dq 0x983e5152ee66dfab,0xa831c66d2db43210
+ dq 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ dq 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ dq 0x06ca6351e003826f,0x142929670a0e6e70
+ dq 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ dq 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ dq 0x650a73548baf63de,0x766a0abb3c77b2a8
+ dq 0x81c2c92e47edaee6,0x92722c851482353b
+ dq 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ dq 0xc24b8b70d0f89791,0xc76c51a30654be30
+ dq 0xd192e819d6ef5218,0xd69906245565a910
+ dq 0xf40e35855771202a,0x106aa07032bbd1b8
+ dq 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ dq 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ dq 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ dq 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ dq 0x748f82ee5defb2fc,0x78a5636f43172f60
+ dq 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ dq 0x90befffa23631e28,0xa4506cebde82bde9
+ dq 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ dq 0xca273eceea26619c,0xd186b8c721c0c207
+ dq 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ dq 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ dq 0x113f9804bef90dae,0x1b710b35131c471b
+ dq 0x28db77f523047d84,0x32caab7b40c72493
+ dq 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh b/src/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh
new file mode 100755
index 000000000..5ec9ba3f3
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/nasm-filter.sh
@@ -0,0 +1,47 @@
+#!/bin/sh
+
+# Filter out unnecessary options added by automake
+
+while [ -n "$*" ]; do
+ case "$1" in
+ -f | -o | -D )
+ # Supported options with arg
+ options="$options $1 $2"
+ shift
+ shift
+ ;;
+ -I | -i )
+ options="$options $1 $2/"
+ shift
+ shift
+ ;;
+ --prefix* )
+ # Supported options without arg
+ options="$options $1"
+ shift
+ ;;
+ -I* | -i* )
+ options="$options $1/"
+ shift
+ ;;
+ -D* ) # For defines we need to remove spaces
+ case "$1" in
+ *' '* ) ;;
+ *) options="$options $1" ;;
+ esac
+ shift
+ ;;
+ #-blah )
+ # Unsupported options with args - none known
+ -* )
+ # Unsupported options with no args
+ shift
+ ;;
+ * )
+ args="$args $1"
+ shift
+ ;;
+ esac
+done
+
+nasm $options $args
diff --git a/src/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh b/src/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh
new file mode 100755
index 000000000..c33952a40
--- /dev/null
+++ b/src/crypto/isa-l/isa-l_crypto/tools/yasm-filter.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+
+# Filter out unnecessary options added by automake
+
+while [ -n "$*" ]; do
+ case "$1" in
+ -f | -o | -I | -i | -D )
+ # Supported options with arg
+ options="$options $1 $2"
+ shift
+ shift
+ ;;
+ -I* | -i* | --prefix* )
+ # Supported options without arg
+ options="$options $1"
+ shift
+ ;;
+ -D* ) # For defines we need to remove spaces
+ case "$1" in
+ *' '* ) ;;
+ *) options="$options $1" ;;
+ esac
+ shift
+ ;;
+ #-blah )
+ # Unsupported options with args - none known
+ -* )
+ # Unsupported options with no args
+ shift
+ ;;
+ * )
+ args="$args $1"
+ shift
+ ;;
+ esac
+done
+
+yasm $options $args
diff --git a/src/crypto/isa-l/isal_crypto_accel.cc b/src/crypto/isa-l/isal_crypto_accel.cc
new file mode 100644
index 000000000..7dccf64fd
--- /dev/null
+++ b/src/crypto/isa-l/isal_crypto_accel.cc
@@ -0,0 +1,43 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "crypto/isa-l/isal_crypto_accel.h"
+
+#include "crypto/isa-l/isa-l_crypto/include/aes_cbc.h"
+
+bool ISALCryptoAccel::cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+ alignas(16) struct cbc_key_data keys_blk;
+ aes_cbc_precomp(const_cast<unsigned char*>(&key[0]), AES_256_KEYSIZE, &keys_blk);
+ aes_cbc_enc_256(const_cast<unsigned char*>(in),
+ const_cast<unsigned char*>(&iv[0]), keys_blk.enc_keys, out, size);
+ return true;
+}
+bool ISALCryptoAccel::cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+ alignas(16) struct cbc_key_data keys_blk;
+ aes_cbc_precomp(const_cast<unsigned char*>(&key[0]), AES_256_KEYSIZE, &keys_blk);
+ aes_cbc_dec_256(const_cast<unsigned char*>(in), const_cast<unsigned char*>(&iv[0]), keys_blk.dec_keys, out, size);
+ return true;
+}
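Both wrappers above refuse buffers whose length is not a multiple of AES_256_IVSIZE (the 16-byte CBC block size) and apply no padding of their own, so alignment is the caller's job. A minimal caller-side sketch of one possible policy (zero fill), assuming AES_256_IVSIZE is 16 and using a hypothetical pad_to_block() helper:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    constexpr std::size_t BLOCK = 16;   // assumed value of AES_256_IVSIZE

    // Round a plaintext up to a whole number of CBC blocks (zero fill), since
    // cbc_encrypt()/cbc_decrypt() above reject sizes that are not multiples of 16.
    std::vector<unsigned char> pad_to_block(const unsigned char* in, std::size_t len) {
      std::size_t padded = (len + BLOCK - 1) / BLOCK * BLOCK;
      std::vector<unsigned char> buf(padded, 0);
      std::copy(in, in + len, buf.begin());
      return buf;
    }

Whatever padding scheme a caller actually uses has to be reversible on decrypt; the accelerator itself only ever sees whole blocks.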
diff --git a/src/crypto/isa-l/isal_crypto_accel.h b/src/crypto/isa-l/isal_crypto_accel.h
new file mode 100644
index 000000000..84331bbdd
--- /dev/null
+++ b/src/crypto/isa-l/isal_crypto_accel.h
@@ -0,0 +1,31 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef ISAL_CRYPTO_ACCEL_H
+#define ISAL_CRYPTO_ACCEL_H
+#include "crypto/crypto_accel.h"
+
+class ISALCryptoAccel : public CryptoAccel {
+ public:
+ ISALCryptoAccel() {}
+ virtual ~ISALCryptoAccel() {}
+
+ bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+ bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+};
+#endif
diff --git a/src/crypto/isa-l/isal_crypto_plugin.cc b/src/crypto/isa-l/isal_crypto_plugin.cc
new file mode 100644
index 000000000..85f0e5f0f
--- /dev/null
+++ b/src/crypto/isa-l/isal_crypto_plugin.cc
@@ -0,0 +1,34 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+
+// -----------------------------------------------------------------------------
+#include "crypto/isa-l/isal_crypto_plugin.h"
+
+#include "ceph_ver.h"
+// -----------------------------------------------------------------------------
+
+const char *__ceph_plugin_version()
+{
+ return CEPH_GIT_NICE_VER;
+}
+
+int __ceph_plugin_init(CephContext *cct,
+ const std::string& type,
+ const std::string& name)
+{
+ auto instance = cct->get_plugin_registry();
+
+ return instance->add(type, name, new ISALCryptoPlugin(cct));
+}
diff --git a/src/crypto/isa-l/isal_crypto_plugin.h b/src/crypto/isa-l/isal_crypto_plugin.h
new file mode 100644
index 000000000..68e782e69
--- /dev/null
+++ b/src/crypto/isa-l/isal_crypto_plugin.h
@@ -0,0 +1,47 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2016 Mirantis, Inc.
+ *
+ * Author: Adam Kupczyk <akupczyk@mirantis.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef ISAL_CRYPTO_PLUGIN_H
+#define ISAL_CRYPTO_PLUGIN_H
+// -----------------------------------------------------------------------------
+#include "crypto/crypto_plugin.h"
+#include "crypto/isa-l/isal_crypto_accel.h"
+#include "arch/intel.h"
+#include "arch/probe.h"
+// -----------------------------------------------------------------------------
+
+
+class ISALCryptoPlugin : public CryptoPlugin {
+
+public:
+
+ explicit ISALCryptoPlugin(CephContext* cct) : CryptoPlugin(cct)
+ {}
+ ~ISALCryptoPlugin()
+ {}
+ virtual int factory(CryptoAccelRef *cs,
+ std::ostream *ss)
+ {
+ if (cryptoaccel == nullptr)
+ {
+ ceph_arch_probe();
+ if (ceph_arch_intel_aesni && ceph_arch_intel_sse41) {
+ cryptoaccel = CryptoAccelRef(new ISALCryptoAccel);
+ }
+ }
+ *cs = cryptoaccel;
+ return 0;
+ }
+};
+#endif
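factory() here constructs the accelerator lazily: the first call probes the CPU (ceph_arch_probe, AES-NI and SSE4.1) and caches an ISALCryptoAccel in cryptoaccel; every later call hands back the same reference. A minimal sketch of that cached-factory shape, with the probing stripped out and Accel/AccelRef/Plugin as stand-in names for CryptoAccel/CryptoAccelRef/CryptoPlugin:

    #include <memory>

    struct Accel { /* cbc_encrypt()/cbc_decrypt() would live here */ };
    using AccelRef = std::shared_ptr<Accel>;

    struct Plugin {
      AccelRef cached;
      int factory(AccelRef* out) {
        if (!cached) {                  // first call: build and cache the backend
          cached = std::make_shared<Accel>();
        }
        *out = cached;                  // later calls hand back the same object
        return 0;
      }
    };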
diff --git a/src/crypto/openssl/CMakeLists.txt b/src/crypto/openssl/CMakeLists.txt
new file mode 100644
index 000000000..6ede1567f
--- /dev/null
+++ b/src/crypto/openssl/CMakeLists.txt
@@ -0,0 +1,14 @@
+## openssl
+
+set(openssl_crypto_plugin_srcs
+ openssl_crypto_accel.cc
+ openssl_crypto_plugin.cc)
+
+add_library(ceph_crypto_openssl SHARED ${openssl_crypto_plugin_srcs})
+target_link_libraries(ceph_crypto_openssl
+ PRIVATE OpenSSL::Crypto
+ $<$<PLATFORM_ID:Windows>:ceph-common>)
+target_include_directories(ceph_crypto_openssl PRIVATE ${OPENSSL_INCLUDE_DIR})
+add_dependencies(crypto_plugins ceph_crypto_openssl)
+set_target_properties(ceph_crypto_openssl PROPERTIES INSTALL_RPATH "")
+install(TARGETS ceph_crypto_openssl DESTINATION ${crypto_plugin_dir})
diff --git a/src/crypto/openssl/openssl_crypto_accel.cc b/src/crypto/openssl/openssl_crypto_accel.cc
new file mode 100644
index 000000000..e6ea0fa72
--- /dev/null
+++ b/src/crypto/openssl/openssl_crypto_accel.cc
@@ -0,0 +1,104 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "crypto/openssl/openssl_crypto_accel.h"
+#include <openssl/evp.h>
+#include <openssl/engine.h>
+#include "common/debug.h"
+
+// -----------------------------------------------------------------------------
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_crypto
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+static std::ostream&
+_prefix(std::ostream* _dout)
+{
+ return *_dout << "OpensslCryptoAccel: ";
+}
+// -----------------------------------------------------------------------------
+
+#define EVP_SUCCESS 1
+#define AES_ENCRYPT 1
+#define AES_DECRYPT 0
+
+bool evp_transform(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char* iv,
+ const unsigned char* key,
+ ENGINE* engine,
+ const EVP_CIPHER* const type,
+ const int encrypt)
+{
+ using pctx_t = std::unique_ptr<EVP_CIPHER_CTX, decltype(&::EVP_CIPHER_CTX_free)>;
+ pctx_t pctx{ EVP_CIPHER_CTX_new(), EVP_CIPHER_CTX_free };
+
+ if (!pctx) {
+ derr << "failed to create evp cipher context" << dendl;
+ return false;
+ }
+
+ if (EVP_CipherInit_ex(pctx.get(), type, engine, key, iv, encrypt) != EVP_SUCCESS) {
+ derr << "EVP_CipherInit_ex failed" << dendl;
+ return false;
+ }
+
+ if (EVP_CIPHER_CTX_set_padding(pctx.get(), 0) != EVP_SUCCESS) {
+ derr << "failed to disable PKCS padding" << dendl;
+ return false;
+ }
+
+ int len_update = 0;
+ if (EVP_CipherUpdate(pctx.get(), out, &len_update, in, size) != EVP_SUCCESS) {
+ derr << "EVP_CipherUpdate failed" << dendl;
+ return false;
+ }
+
+ int len_final = 0;
+ if (EVP_CipherFinal_ex(pctx.get(), out + len_update, &len_final) != EVP_SUCCESS) {
+ derr << "EVP_CipherFinal_ex failed" << dendl;
+ return false;
+ }
+
+ ceph_assert(len_final == 0);
+ return (len_update + len_final) == static_cast<int>(size);
+}
+
+bool OpenSSLCryptoAccel::cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+
+ return evp_transform(out, in, size, const_cast<unsigned char*>(&iv[0]),
+ const_cast<unsigned char*>(&key[0]),
+ nullptr, // Hardware acceleration engine can be used in the future
+ EVP_aes_256_cbc(), AES_ENCRYPT);
+}
+
+bool OpenSSLCryptoAccel::cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+
+ return evp_transform(out, in, size, const_cast<unsigned char*>(&iv[0]),
+ const_cast<unsigned char*>(&key[0]),
+ nullptr, // Hardware acceleration engine can be used in the future
+ EVP_aes_256_cbc(), AES_DECRYPT);
+}
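evp_transform() follows the usual one-shot EVP sequence: create a context, EVP_CipherInit_ex, disable PKCS padding, EVP_CipherUpdate, then EVP_CipherFinal_ex. With padding off, the final step emits no extra bytes, which is why cbc_encrypt()/cbc_decrypt() insist that size be a multiple of AES_256_IVSIZE up front. A condensed standalone sketch of the same flow, with cbc256_once() as a hypothetical name and the caller assumed to supply a 32-byte key, 16-byte IV, and block-aligned buffers:

    #include <openssl/evp.h>

    // One-shot AES-256-CBC with padding disabled; len must be a multiple of 16.
    bool cbc256_once(unsigned char* out, const unsigned char* in, int len,
                     const unsigned char* key, const unsigned char* iv, int enc) {
      EVP_CIPHER_CTX* ctx = EVP_CIPHER_CTX_new();
      if (!ctx) return false;
      int outl = 0, finl = 0;
      bool ok = EVP_CipherInit_ex(ctx, EVP_aes_256_cbc(), nullptr, key, iv, enc) == 1
             && EVP_CIPHER_CTX_set_padding(ctx, 0) == 1    // caller guarantees alignment
             && EVP_CipherUpdate(ctx, out, &outl, in, len) == 1
             && EVP_CipherFinal_ex(ctx, out + outl, &finl) == 1;
      EVP_CIPHER_CTX_free(ctx);
      return ok && (outl + finl) == len;
    }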
diff --git a/src/crypto/openssl/openssl_crypto_accel.h b/src/crypto/openssl/openssl_crypto_accel.h
new file mode 100644
index 000000000..ad90cbece
--- /dev/null
+++ b/src/crypto/openssl/openssl_crypto_accel.h
@@ -0,0 +1,32 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef OPENSSL_CRYPTO_ACCEL_H
+#define OPENSSL_CRYPTO_ACCEL_H
+
+#include "crypto/crypto_accel.h"
+
+class OpenSSLCryptoAccel : public CryptoAccel {
+ public:
+ OpenSSLCryptoAccel() {}
+ virtual ~OpenSSLCryptoAccel() {}
+
+ bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+ bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+};
+#endif
diff --git a/src/crypto/openssl/openssl_crypto_plugin.cc b/src/crypto/openssl/openssl_crypto_plugin.cc
new file mode 100644
index 000000000..e6ecea2fd
--- /dev/null
+++ b/src/crypto/openssl/openssl_crypto_plugin.cc
@@ -0,0 +1,32 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+
+#include "crypto/openssl/openssl_crypto_plugin.h"
+
+#include "ceph_ver.h"
+
+const char *__ceph_plugin_version()
+{
+ return CEPH_GIT_NICE_VER;
+}
+
+int __ceph_plugin_init(CephContext *cct,
+ const std::string& type,
+ const std::string& name)
+{
+ auto instance = cct->get_plugin_registry();
+
+ return instance->add(type, name, new OpenSSLCryptoPlugin(cct));
+}
diff --git a/src/crypto/openssl/openssl_crypto_plugin.h b/src/crypto/openssl/openssl_crypto_plugin.h
new file mode 100644
index 000000000..408d9ebda
--- /dev/null
+++ b/src/crypto/openssl/openssl_crypto_plugin.h
@@ -0,0 +1,36 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef OPENSSL_CRYPTO_PLUGIN_H
+#define OPENSSL_CRYPTO_PLUGIN_H
+
+#include "crypto/crypto_plugin.h"
+#include "crypto/openssl/openssl_crypto_accel.h"
+
+
+class OpenSSLCryptoPlugin : public CryptoPlugin {
+
+public:
+ explicit OpenSSLCryptoPlugin(CephContext* cct) : CryptoPlugin(cct)
+ {}
+ int factory(CryptoAccelRef *cs, std::ostream *ss) override {
+ if (cryptoaccel == nullptr)
+ cryptoaccel = CryptoAccelRef(new OpenSSLCryptoAccel);
+
+ *cs = cryptoaccel;
+ return 0;
+ }
+};
+#endif
diff --git a/src/crypto/qat/CMakeLists.txt b/src/crypto/qat/CMakeLists.txt
new file mode 100644
index 000000000..ea4c1e25d
--- /dev/null
+++ b/src/crypto/qat/CMakeLists.txt
@@ -0,0 +1,25 @@
+##
+# QAT wrapper for Ceph
+##
+
+# Build QAT driver library first
+include(BuildQatDrv)
+
+set(qat_crypto_plugin_srcs
+ qat_crypto_accel.cc
+ qat_crypto_plugin.cc
+ qcccrypto.cc)
+
+add_library(ceph_crypto_qat SHARED ${qat_crypto_plugin_srcs})
+add_dependencies(ceph_crypto_qat QatDrv)
+target_include_directories(ceph_crypto_qat PRIVATE
+ ${EXTRA_LIBS}
+ ${QatDrv_INCLUDE_DIRS})
+
+add_dependencies(crypto_plugins ceph_crypto_qat)
+
+target_link_libraries(ceph_crypto_qat
+ ${QatDrv_LIBRARIES})
+set_target_properties(ceph_crypto_qat PROPERTIES VERSION 1.0.0 SOVERSION 1)
+install(TARGETS ceph_crypto_qat DESTINATION ${crypto_plugin_dir})
diff --git a/src/crypto/qat/qat_crypto_accel.cc b/src/crypto/qat/qat_crypto_accel.cc
new file mode 100644
index 000000000..23f86edfa
--- /dev/null
+++ b/src/crypto/qat/qat_crypto_accel.cc
@@ -0,0 +1,42 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ * Author: Ganesh Mahalingam <ganesh.mahalingam@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#include "crypto/qat/qat_crypto_accel.h"
+
+bool QccCryptoAccel::cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+
+ return qcccrypto.perform_op(out, in, size,
+ const_cast<unsigned char *>(&iv[0]),
+ const_cast<unsigned char *>(&key[0]), CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT);
+}
+
+bool QccCryptoAccel::cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE])
+{
+ if ((size % AES_256_IVSIZE) != 0) {
+ return false;
+ }
+
+ return qcccrypto.perform_op(out, in, size,
+ const_cast<unsigned char *>(&iv[0]),
+ const_cast<unsigned char *>(&key[0]), CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT);
+}
diff --git a/src/crypto/qat/qat_crypto_accel.h b/src/crypto/qat/qat_crypto_accel.h
new file mode 100644
index 000000000..5badefc28
--- /dev/null
+++ b/src/crypto/qat/qat_crypto_accel.h
@@ -0,0 +1,35 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ * Author: Ganesh Mahalingam <ganesh.mahalingam@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef QAT_CRYPTO_ACCEL_H
+#define QAT_CRYPTO_ACCEL_H
+
+#include "crypto/crypto_accel.h"
+#include "crypto/qat/qcccrypto.h"
+
+class QccCryptoAccel : public CryptoAccel {
+ public:
+ QccCrypto qcccrypto;
+ QccCryptoAccel() { qcccrypto.init(); };
+ ~QccCryptoAccel() { qcccrypto.destroy(); };
+
+ bool cbc_encrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+ bool cbc_decrypt(unsigned char* out, const unsigned char* in, size_t size,
+ const unsigned char (&iv)[AES_256_IVSIZE],
+ const unsigned char (&key)[AES_256_KEYSIZE]) override;
+};
+#endif
diff --git a/src/crypto/qat/qat_crypto_plugin.cc b/src/crypto/qat/qat_crypto_plugin.cc
new file mode 100644
index 000000000..4bf3d61bb
--- /dev/null
+++ b/src/crypto/qat/qat_crypto_plugin.cc
@@ -0,0 +1,35 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ * Author: Ganesh Mahalingam <ganesh.mahalingam@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+
+#include "crypto/qat/qat_crypto_plugin.h"
+
+#include "ceph_ver.h"
+
+std::mutex QccCryptoPlugin::qat_init;
+
+const char *__ceph_plugin_version()
+{
+ return CEPH_GIT_NICE_VER;
+}
+
+int __ceph_plugin_init(CephContext *cct,
+ const std::string& type,
+ const std::string& name)
+{
+ PluginRegistry *instance = cct->get_plugin_registry();
+
+ return instance->add(type, name, new QccCryptoPlugin(cct));
+}
diff --git a/src/crypto/qat/qat_crypto_plugin.h b/src/crypto/qat/qat_crypto_plugin.h
new file mode 100644
index 000000000..736f14706
--- /dev/null
+++ b/src/crypto/qat/qat_crypto_plugin.h
@@ -0,0 +1,43 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author: Qiaowei Ren <qiaowei.ren@intel.com>
+ * Author: Ganesh Mahalingam <ganesh.mahalingam@intel.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef QAT_CRYPTO_PLUGIN_H
+#define QAT_CRYPTO_PLUGIN_H
+
+#include "crypto/crypto_plugin.h"
+#include "crypto/qat/qat_crypto_accel.h"
+
+
+class QccCryptoPlugin : public CryptoPlugin {
+ static std::mutex qat_init;
+
+public:
+
+ explicit QccCryptoPlugin(CephContext* cct) : CryptoPlugin(cct)
+ {}
+ ~QccCryptoPlugin()
+ {}
+ virtual int factory(CryptoAccelRef *cs,
+ std::ostream *ss)
+ {
+ std::lock_guard<std::mutex> l(qat_init);
+ if (cryptoaccel == nullptr)
+ cryptoaccel = CryptoAccelRef(new QccCryptoAccel);
+
+ *cs = cryptoaccel;
+ return 0;
+ }
+};
+#endif
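Unlike the OpenSSL plugin, this factory takes the class-wide qat_init mutex before constructing QccCryptoAccel, since that constructor runs the full QAT bring-up in QccCrypto::init() and must not race across threads. A minimal sketch of that mutex-guarded lazy initialization, assuming hypothetical Backend/BackendRef/get_backend() names in place of QccCrypto and CryptoAccelRef:

    #include <memory>
    #include <mutex>

    struct Backend { bool init() { return true; } };    // stand-in for QccCrypto
    using BackendRef = std::shared_ptr<Backend>;

    BackendRef get_backend() {
      static std::mutex init_mutex;                     // plays the role of qat_init
      static BackendRef cached;
      std::lock_guard<std::mutex> l(init_mutex);
      if (!cached) {
        auto b = std::make_shared<Backend>();
        if (!b->init()) return nullptr;                 // leave cache empty; retry later
        cached = std::move(b);
      }
      return cached;
    }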
diff --git a/src/crypto/qat/qcccrypto.cc b/src/crypto/qat/qcccrypto.cc
new file mode 100644
index 000000000..410727d3b
--- /dev/null
+++ b/src/crypto/qat/qcccrypto.cc
@@ -0,0 +1,469 @@
+#include "qcccrypto.h"
+#include <iostream>
+#include "string.h"
+#include <pthread.h>
+#include "common/debug.h"
+#include "include/scope_guard.h"
+
+// -----------------------------------------------------------------------------
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_rgw
+#undef dout_prefix
+#define dout_prefix _prefix(_dout)
+
+static std::ostream&
+_prefix(std::ostream* _dout)
+{
+ return *_dout << "QccCrypto: ";
+}
+// -----------------------------------------------------------------------------
+
+/*
+ * Poller thread & functions
+*/
+static std::mutex qcc_alloc_mutex;
+static std::mutex qcc_eng_mutex;
+static std::atomic<bool> init_called = { false };
+
+void* QccCrypto::crypt_thread(void *args) {
+ struct qcc_thread_args *thread_args = (struct qcc_thread_args *)args;
+ thread_args->qccinstance->do_crypt(thread_args);
+ return nullptr;
+}
+
+void QccCrypto::QccFreeInstance(int entry) {
+ std::lock_guard<std::mutex> freeinst(qcc_alloc_mutex);
+ open_instances.push(entry);
+}
+
+int QccCrypto::QccGetFreeInstance() {
+ int ret = -1;
+ std::lock_guard<std::mutex> getinst(qcc_alloc_mutex);
+ if (!open_instances.empty()) {
+ ret = open_instances.front();
+ open_instances.pop();
+ }
+ return ret;
+}
+
+void QccCrypto::cleanup() {
+ icp_sal_userStop();
+ qaeMemDestroy();
+ is_init = false;
+ init_stat = stat;
+ init_called = false;
+ derr << "Failure during QAT init sequence. Quitting" << dendl;
+}
+
+/*
+ * We initialize QAT instance and everything that is common for all ops
+*/
+bool QccCrypto::init()
+{
+
+ std::lock_guard<std::mutex> l(qcc_eng_mutex);
+
+ if(init_called) {
+ dout(10) << "Init sequence already called. Skipping duplicate call" << dendl;
+ return true;
+ }
+
+ // First call to init
+ dout(15) << "First init for QAT" << dendl;
+ init_called = true;
+
+ // Find out whether the usermode memory driver is available. We need this
+ // to create the contiguous memory needed by QAT.
+ stat = qaeMemInit();
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to load memory driver" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ stat = icp_sal_userStart("CEPH");
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to start qat device" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ qcc_os_mem_alloc((void **)&qcc_inst, sizeof(QCCINST));
+ if(qcc_inst == NULL) {
+ derr << "Unable to alloc mem for instance struct" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ // Initialize contents of qcc_inst
+ qcc_inst->num_instances = 0;
+ qcc_inst->cy_inst_handles = NULL;
+
+ stat = cpaCyGetNumInstances(&(qcc_inst->num_instances));
+ if ((stat != CPA_STATUS_SUCCESS) || (qcc_inst->num_instances <= 0)) {
+ derr << "Unable to find available instances" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ qcc_os_mem_alloc((void **)&qcc_inst->cy_inst_handles,
+ ((int)qcc_inst->num_instances * sizeof(CpaInstanceHandle)));
+ if (qcc_inst->cy_inst_handles == NULL) {
+ derr << "Unable to allocate instances array memory" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ stat = cpaCyGetInstances(qcc_inst->num_instances, qcc_inst->cy_inst_handles);
+ if (stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to get instances" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ int iter = 0;
+ //Start Instances
+ for(iter = 0; iter < qcc_inst->num_instances; iter++) {
+ stat = cpaCyStartInstance(qcc_inst->cy_inst_handles[iter]);
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to start instance" << dendl;
+ this->cleanup();
+ return false;
+ }
+ }
+
+ qcc_os_mem_alloc((void **)&qcc_inst->is_polled,
+ ((int)qcc_inst->num_instances * sizeof(CpaBoolean)));
+ CpaInstanceInfo2 info;
+ for(iter = 0; iter < qcc_inst->num_instances; iter++) {
+ qcc_inst->is_polled[iter] = cpaCyInstanceGetInfo2(qcc_inst->cy_inst_handles[iter],
+ &info) == CPA_STATUS_SUCCESS ? info.isPolled : CPA_FALSE;
+ }
+
+ // Allocate memory structures for all instances
+ qcc_os_mem_alloc((void **)&qcc_sess,
+ ((int)qcc_inst->num_instances * sizeof(QCCSESS)));
+ if(qcc_sess == NULL) {
+ derr << "Unable to allocate memory for session struct" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ qcc_os_mem_alloc((void **)&qcc_op_mem,
+ ((int)qcc_inst->num_instances * sizeof(QCCOPMEM)));
+ if(qcc_op_mem == NULL) {
+ derr << "Unable to allocate memory for opmem struct" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ qcc_os_mem_alloc((void **)&cypollthreads,
+ ((int)qcc_inst->num_instances * sizeof(pthread_t)));
+ if(cypollthreads == NULL) {
+ derr << "Unable to allocate memory for pthreads" << dendl;
+ this->cleanup();
+ return false;
+ }
+
+ //At this point we are only doing a user-space version.
+ //To-Do: Maybe a kernel-based one
+ for(iter = 0; iter < qcc_inst->num_instances; iter++) {
+ stat = cpaCySetAddressTranslation(qcc_inst->cy_inst_handles[iter],
+ qaeVirtToPhysNUMA);
+ if(stat == CPA_STATUS_SUCCESS) {
+ // Start HW Polling Thread
+ // To-Do: Enable epoll & interrupt based later?
+ // QccCyStartPoll(iter);
+ // Setup the session structures for crypto operation and populate
+ // whatever we can now. Rest will be filled in when crypto operation
+ // happens.
+ qcc_sess[iter].sess_ctx_sz = 0;
+ qcc_sess[iter].sess_ctx = NULL;
+ qcc_sess[iter].sess_stp_data.sessionPriority = CPA_CY_PRIORITY_NORMAL;
+ qcc_sess[iter].sess_stp_data.symOperation = CPA_CY_SYM_OP_CIPHER;
+ open_instances.push(iter);
+ qcc_op_mem[iter].is_mem_alloc = false;
+ qcc_op_mem[iter].op_complete = false;
+ qcc_op_mem[iter].op_result = CPA_STATUS_SUCCESS;
+ qcc_op_mem[iter].sym_op_data = NULL;
+ qcc_op_mem[iter].buff_meta_size = qcc_op_mem[iter].buff_size = 0;
+ qcc_op_mem[iter].src_buff_meta = qcc_op_mem[iter].src_buff
+ = qcc_op_mem[iter].iv_buff = NULL;
+ qcc_op_mem[iter].src_buff_list = NULL;
+ qcc_op_mem[iter].src_buff_flat = NULL;
+ qcc_op_mem[iter].num_buffers = 1;
+ } else {
+ derr << "Unable to find address translations of instance " << iter << dendl;
+ this->cleanup();
+ return false;
+ }
+ }
+ is_init = true;
+ dout(10) << "Init complete" << dendl;
+ return true;
+}
+
+bool QccCrypto::destroy() {
+ if((!is_init) || (!init_called)) {
+ dout(15) << "QAT not initialized here. Nothing to do" << dendl;
+ return false;
+ }
+
+ // Wait (with bounded retries) until every instance has been returned to
+ // the free pool before tearing anything down
+ unsigned int retry = 0;
+ while(retry <= QCC_MAX_RETRIES &&
+ open_instances.size() != qcc_inst->num_instances) {
+ retry++;
+ usleep(qcc_sleep_duration);
+ }
+ if(open_instances.size() != qcc_inst->num_instances) {
+ dout(5) << "QAT is still busy and cannot free resources yet" << dendl;
+ return false;
+ }
+
+ dout(10) << "Destroying QAT crypto & related memory" << dendl;
+ int iter = 0;
+
+ // Free up op related memory
+ for (iter =0; iter < qcc_inst->num_instances; iter++) {
+ qcc_contig_mem_free((void **)&(qcc_op_mem[iter].src_buff));
+ qcc_contig_mem_free((void **)&(qcc_op_mem[iter].iv_buff));
+ qcc_os_mem_free((void **)&(qcc_op_mem[iter].src_buff_list));
+ qcc_os_mem_free((void **)&(qcc_op_mem[iter].src_buff_flat));
+ qcc_contig_mem_free((void **)&(qcc_op_mem[iter].sym_op_data));
+ }
+
+ // Free up Session memory
+ for(iter = 0; iter < qcc_inst->num_instances; iter++) {
+ cpaCySymRemoveSession(qcc_inst->cy_inst_handles[iter], qcc_sess[iter].sess_ctx);
+ qcc_contig_mem_free((void **)&(qcc_sess[iter].sess_ctx));
+ }
+
+ // Stop QAT Instances
+ for(iter = 0; iter < qcc_inst->num_instances; iter++) {
+ cpaCyStopInstance(qcc_inst->cy_inst_handles[iter]);
+ }
+
+ // Free up the base structures we use
+ qcc_os_mem_free((void **)&qcc_op_mem);
+ qcc_os_mem_free((void **)&qcc_sess);
+ qcc_os_mem_free((void **)&(qcc_inst->cy_inst_handles));
+ qcc_os_mem_free((void **)&(qcc_inst->is_polled));
+ qcc_os_mem_free((void **)&cypollthreads);
+ qcc_os_mem_free((void **)&qcc_inst);
+
+ //Un-init memory driver and QAT HW
+ icp_sal_userStop();
+ qaeMemDestroy();
+ init_called = false;
+ is_init = false;
+ return true;
+}
+
+void QccCrypto::do_crypt(qcc_thread_args *thread_args) {
+ auto entry = thread_args->entry;
+ qcc_op_mem[entry].op_result = cpaCySymPerformOp(qcc_inst->cy_inst_handles[entry],
+ NULL,
+ qcc_op_mem[entry].sym_op_data,
+ qcc_op_mem[entry].src_buff_list,
+ qcc_op_mem[entry].src_buff_list,
+ NULL);
+ qcc_op_mem[entry].op_complete = true;
+ free(thread_args);
+}
+
+bool QccCrypto::perform_op(unsigned char* out, const unsigned char* in,
+ size_t size, uint8_t *iv, uint8_t *key, CpaCySymCipherDirection op_type)
+{
+ if (!init_called) {
+ dout(10) << "QAT not intialized yet. Initializing now..." << dendl;
+ if(!QccCrypto::init()) {
+ derr << "QAT init failed" << dendl;
+ return false;
+ }
+ }
+
+ if(!is_init)
+ {
+ dout(10) << "QAT not initialized in this instance or init failed with possible error " << (int)init_stat << dendl;
+ return is_init;
+ }
+
+ int avail_inst = -1;
+ unsigned int retrycount = 0;
+ while(retrycount <= QCC_MAX_RETRIES) {
+ avail_inst = QccGetFreeInstance();
+ if(avail_inst != -1) {
+ break;
+ } else {
+ retrycount++;
+ usleep(qcc_sleep_duration);
+ }
+ }
+
+ if(avail_inst == -1) {
+ derr << "Unable to get an QAT instance. Failing request" << dendl;
+ return false;
+ }
+
+ dout(15) << "Using inst " << avail_inst << dendl;
+ // Start polling threads for this instance
+ //QccCyStartPoll(avail_inst);
+
+ auto sg = make_scope_guard([=] {
+ //free up the instance irrespective of the op status
+ dout(15) << "Completed task under " << avail_inst << dendl;
+ qcc_op_mem[avail_inst].op_complete = false;
+ QccCrypto::QccFreeInstance(avail_inst);
+ });
+
+ /*
+ * Allocate buffers for this instance if not already done.
+ * Hold onto most of them until the destructor is called.
+ */
+ if (qcc_op_mem[avail_inst].is_mem_alloc == false) {
+
+ qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherAlgorithm =
+ CPA_CY_SYM_CIPHER_AES_CBC;
+ qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherKeyLenInBytes =
+ AES_256_KEY_SIZE;
+
+ // Allocate contig memory for buffers that are independent of the
+ // input/output
+ stat = cpaCyBufferListGetMetaSize(qcc_inst->cy_inst_handles[avail_inst],
+ qcc_op_mem[avail_inst].num_buffers, &(qcc_op_mem[avail_inst].buff_meta_size));
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to get buff meta size" << dendl;
+ return false;
+ }
+
+ // Allocate Buffer List Private metadata
+ stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff_meta),
+ qcc_op_mem[avail_inst].buff_meta_size, 1);
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to allocate private metadata memory" << dendl;
+ return false;
+ }
+
+ // Allocate Buffer List Memory
+ qcc_os_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff_list), sizeof(CpaBufferList));
+ qcc_os_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff_flat),
+ (qcc_op_mem[avail_inst].num_buffers * sizeof(CpaFlatBuffer)));
+ if(qcc_op_mem[avail_inst].src_buff_list == NULL || qcc_op_mem[avail_inst].src_buff_flat == NULL) {
+ derr << "Unable to allocate bufferlist memory" << dendl;
+ return false;
+ }
+
+ // Allocate IV memory
+ stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].iv_buff), AES_256_IV_LEN);
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to allocate bufferlist memory" << dendl;
+ return false;
+ }
+
+ //Assign src stuff for the operation
+ (qcc_op_mem[avail_inst].src_buff_list)->pBuffers = qcc_op_mem[avail_inst].src_buff_flat;
+ (qcc_op_mem[avail_inst].src_buff_list)->numBuffers = qcc_op_mem[avail_inst].num_buffers;
+ (qcc_op_mem[avail_inst].src_buff_list)->pPrivateMetaData = qcc_op_mem[avail_inst].src_buff_meta;
+
+ //Setup OpData
+ stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].sym_op_data),
+ sizeof(CpaCySymOpData));
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to allocate opdata memory" << dendl;
+ return false;
+ }
+
+ // Assume encryption while creating the session; the actual cipher
+ // direction is set on every call after this block
+ qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherDirection =
+ CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT;
+ // Allocate Session memory
+ stat = cpaCySymSessionCtxGetSize(qcc_inst->cy_inst_handles[avail_inst],
+ &(qcc_sess[avail_inst].sess_stp_data), &(qcc_sess[avail_inst].sess_ctx_sz));
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to find session size" << dendl;
+ return false;
+ }
+
+ stat = qcc_contig_mem_alloc((void **)&(qcc_sess[avail_inst].sess_ctx),
+ qcc_sess[avail_inst].sess_ctx_sz);
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to allocate contig memory" << dendl;
+ return false;
+ }
+
+ // Set memalloc flag so that we don't go through this exercise again.
+ qcc_op_mem[avail_inst].is_mem_alloc = true;
+ dout(15) << "Instantiation complete for " << avail_inst << dendl;
+ }
+
+ // Section that runs on every call
+ // Identify the operation and assign to session
+ qcc_sess[avail_inst].sess_stp_data.cipherSetupData.cipherDirection = op_type;
+ qcc_sess[avail_inst].sess_stp_data.cipherSetupData.pCipherKey = (Cpa8U *)key;
+
+ stat = cpaCySymInitSession(qcc_inst->cy_inst_handles[avail_inst],
+ NULL,
+ &(qcc_sess[avail_inst].sess_stp_data),
+ qcc_sess[avail_inst].sess_ctx);
+ if (stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to init session" << dendl;
+ return false;
+ }
+
+ // Allocate actual buffers that will hold data
+ if (qcc_op_mem[avail_inst].buff_size != (Cpa32U)size) {
+ qcc_contig_mem_free((void **)&(qcc_op_mem[avail_inst].src_buff));
+ qcc_op_mem[avail_inst].buff_size = (Cpa32U)size;
+ stat = qcc_contig_mem_alloc((void **)&(qcc_op_mem[avail_inst].src_buff),
+ qcc_op_mem[avail_inst].buff_size);
+ if(stat != CPA_STATUS_SUCCESS) {
+ derr << "Unable to allocate contig memory" << dendl;
+ return false;
+ }
+ }
+
+ // Copy src & iv into the respective buffers
+ memcpy(qcc_op_mem[avail_inst].src_buff, in, size);
+ memcpy(qcc_op_mem[avail_inst].iv_buff, iv, AES_256_IV_LEN);
+
+ //Assign the remainder of the buffer fields
+ qcc_op_mem[avail_inst].src_buff_flat->dataLenInBytes = qcc_op_mem[avail_inst].buff_size;
+ qcc_op_mem[avail_inst].src_buff_flat->pData = qcc_op_mem[avail_inst].src_buff;
+
+ //OpData assignment
+ qcc_op_mem[avail_inst].sym_op_data->sessionCtx = qcc_sess[avail_inst].sess_ctx;
+ qcc_op_mem[avail_inst].sym_op_data->packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
+ qcc_op_mem[avail_inst].sym_op_data->pIv = qcc_op_mem[avail_inst].iv_buff;
+ qcc_op_mem[avail_inst].sym_op_data->ivLenInBytes = AES_256_IV_LEN;
+ qcc_op_mem[avail_inst].sym_op_data->cryptoStartSrcOffsetInBytes = 0;
+ qcc_op_mem[avail_inst].sym_op_data->messageLenToCipherInBytes = qcc_op_mem[avail_inst].buff_size;
+
+ // Perform cipher operation in a thread
+ qcc_thread_args* thread_args = new qcc_thread_args();
+ thread_args->qccinstance = this;
+ thread_args->entry = avail_inst;
+
+ if (pthread_create(&cypollthreads[avail_inst], NULL, crypt_thread, (void *)thread_args) != 0) {
+ derr << "Unable to create thread for crypt operation" << dendl;
+ return false;
+ }
+ if (qcc_inst->is_polled[avail_inst] == CPA_TRUE) {
+ while (!qcc_op_mem[avail_inst].op_complete) {
+ icp_sal_CyPollInstance(qcc_inst->cy_inst_handles[avail_inst], 0);
+ }
+ }
+ pthread_join(cypollthreads[avail_inst], NULL);
+
+ if(qcc_op_mem[avail_inst].op_result != CPA_STATUS_SUCCESS) {
+ derr << "Unable to perform crypt operation" << dendl;
+ return false;
+ }
+
+ //Copy data back to out buffer
+ memcpy(out, qcc_op_mem[avail_inst].src_buff, size);
+ //Always cleanup memory holding user-data at the end
+ memset(qcc_op_mem[avail_inst].iv_buff, 0, AES_256_IV_LEN);
+ memset(qcc_op_mem[avail_inst].src_buff, 0, qcc_op_mem[avail_inst].buff_size);
+
+ return true;
+}
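perform_op() leases an instance index from open_instances under qcc_alloc_mutex, retries with usleep(qcc_sleep_duration) while the pool is empty, and relies on a scope guard to return the index once the operation completes. A condensed sketch of that pool, assuming hypothetical InstancePool/take()/give_back() names in place of the raw queue plus QccGetFreeInstance()/QccFreeInstance():

    #include <mutex>
    #include <optional>
    #include <queue>

    class InstancePool {
      std::mutex m;
      std::queue<int> free_;
     public:
      explicit InstancePool(int n) { for (int i = 0; i < n; ++i) free_.push(i); }
      std::optional<int> take() {                 // QccGetFreeInstance() equivalent
        std::lock_guard<std::mutex> l(m);
        if (free_.empty()) return std::nullopt;
        int i = free_.front();
        free_.pop();
        return i;
      }
      void give_back(int i) {                     // QccFreeInstance() equivalent
        std::lock_guard<std::mutex> l(m);
        free_.push(i);
      }
    };

A caller would pair take() with a bounded retry-and-sleep loop and put give_back() in a scope guard, matching the QCC_MAX_RETRIES loop above.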
diff --git a/src/crypto/qat/qcccrypto.h b/src/crypto/qat/qcccrypto.h
new file mode 100644
index 000000000..a36b0898b
--- /dev/null
+++ b/src/crypto/qat/qcccrypto.h
@@ -0,0 +1,176 @@
+#ifndef QCCCRYPTO_H
+#define QCCCRYPTO_H
+
+#include <atomic>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <queue>
+extern "C" {
+#include "cpa.h"
+#include "lac/cpa_cy_sym.h"
+#include "lac/cpa_cy_im.h"
+#include "qae_mem.h"
+#include "icp_sal_user.h"
+#include "icp_sal_poll.h"
+#include "qae_mem_utils.h"
+}
+
+class QccCrypto {
+
+ public:
+ CpaCySymCipherDirection qcc_op_type;
+
+ QccCrypto() {};
+ ~QccCrypto() {};
+
+ bool init();
+ bool destroy();
+ bool perform_op(unsigned char* out, const unsigned char* in, size_t size,
+ uint8_t *iv,
+ uint8_t *key,
+ CpaCySymCipherDirection op_type);
+
+ private:
+
+ // Currently only supporting AES_256_CBC.
+ // To-Do: Needs to be expanded
+ static const size_t AES_256_IV_LEN = 16;
+ static const size_t AES_256_KEY_SIZE = 32;
+ static const size_t QCC_MAX_RETRIES = 5000;
+
+ /*
+ * Struct to hold an instance of QAT to handle the crypto operations. These
+ * will be identified at the start and held until the destructor is called
+ * To-Do:
+ * The struct was created assuming that we will use all the instances.
+ * Expand current implementation to allow multiple instances to operate
+ * independently.
+ */
+ struct QCCINST {
+ CpaInstanceHandle *cy_inst_handles;
+ CpaBoolean *is_polled;
+ Cpa16U num_instances;
+ } *qcc_inst;
+
+ /*
+ * QAT Crypto Session
+ * The crypto session context and setup data hold the priority,
+ * the type of crypto operation (cipher/chained),
+ * the cipher algorithm (AES, DES, etc.), and whether the operation
+ * is single-buffer or multi-buffer crypto.
+ */
+ struct QCCSESS {
+ CpaCySymSessionSetupData sess_stp_data;
+ Cpa32U sess_ctx_sz;
+ CpaCySymSessionCtx sess_ctx;
+ } *qcc_sess;
+
+ /*
+ * Cipher Memory Allocations
+ * Holds the bufferlist, flatbuffer, cipher operation data and buffer metadata needed
+ * by QAT to perform the operation. Also buffers for IV, SRC, DEST.
+ */
+ struct QCCOPMEM {
+ // Op common items
+ bool is_mem_alloc;
+ bool op_complete;
+ CpaStatus op_result;
+ CpaCySymOpData *sym_op_data;
+ Cpa32U buff_meta_size;
+ Cpa32U num_buffers;
+ Cpa32U buff_size;
+
+ //Src data items
+ Cpa8U *src_buff_meta;
+ CpaBufferList *src_buff_list;
+ CpaFlatBuffer *src_buff_flat;
+ Cpa8U *src_buff;
+ Cpa8U *iv_buff;
+ } *qcc_op_mem;
+
+ //QAT HW polling thread input structure
+ struct qcc_thread_args {
+ QccCrypto* qccinstance;
+ int entry;
+ };
+
+
+ /*
+ * Function to handle the crypt operation. Will run while the main thread
+ * runs the polling function on the instance doing the op
+ */
+ void do_crypt(qcc_thread_args *thread_args);
+
+ /*
+ * Handle queue with free instances to handle op
+ */
+ std::queue<int> open_instances;
+ int QccGetFreeInstance();
+ void QccFreeInstance(int entry);
+
+ /*
+ * Contiguous memory allocator and de-allocator. We use the usdm
+ * driver that ships with QAT to obtain DMA-able contiguous memory
+ * backed by hugepages.
+ * To-Do: A kernel based one.
+ */
+ static inline void qcc_contig_mem_free(void **ptr) {
+ if (*ptr) {
+ qaeMemFreeNUMA(ptr);
+ *ptr = NULL;
+ }
+ }
+
+ static inline CpaStatus qcc_contig_mem_alloc(void **ptr, Cpa32U size, Cpa32U alignment = 1) {
+ *ptr = qaeMemAllocNUMA(size, 0, alignment);
+ if (NULL == *ptr)
+ {
+ return CPA_STATUS_RESOURCE;
+ }
+ return CPA_STATUS_SUCCESS;
+ }
+
+ /*
+ * Malloc & free calls masked to maintain consistency and future kernel
+ * alloc support.
+ */
+ static inline void qcc_os_mem_free(void **ptr) {
+ if (*ptr) {
+ free(*ptr);
+ *ptr = NULL;
+ }
+ }
+
+ static inline CpaStatus qcc_os_mem_alloc(void **ptr, Cpa32U size) {
+ *ptr = malloc(size);
+ if (*ptr == NULL)
+ {
+ return CPA_STATUS_RESOURCE;
+ }
+ return CPA_STATUS_SUCCESS;
+ }
+
+ std::atomic<bool> is_init = { false };
+ CpaStatus init_stat, stat;
+
+ /*
+ * Function to cleanup memory if constructor fails
+ */
+ void cleanup();
+
+ /*
+ * Crypto Polling Function & helpers
+ * This retrieves data from the QAT rings and dispatches the
+ * associated callbacks. For synchronous operation (like this one), the QAT
+ * library creates an internal callback for the operation.
+ */
+ static void* crypt_thread(void* entry);
+ CpaStatus QccCyStartPoll(int entry);
+ void poll_instance(int entry);
+
+ pthread_t *cypollthreads;
+ static const size_t qcc_sleep_duration = 2;
+};
+#endif //QCCCRYPTO_H